diff --git a/mlir/.clang-format b/mlir/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..392e20189554b6d594482cd341550576e98d32a8 --- /dev/null +++ b/mlir/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: LLVM +AlwaysBreakTemplateDeclarations: Yes \ No newline at end of file diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..67d1f00322c5c3078f76e6ca752004a06dcb615f --- /dev/null +++ b/mlir/CMakeLists.txt @@ -0,0 +1,108 @@ +# MLIR project. +set(MLIR_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include ) # --src-root +set(MLIR_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include ) # --includedir +set(MLIR_TABLEGEN_EXE mlir-tblgen) + +set(MLIR_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(MLIR_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +function(mlir_tablegen ofn) + tablegen(MLIR ${ARGV} "-I${MLIR_MAIN_SRC_DIR}" "-I${MLIR_INCLUDE_DIR}") + set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + PARENT_SCOPE) +endfunction() + +function(add_mlir_dialect dialect dialect_doc_filename) + set(LLVM_TARGET_DEFINITIONS ${dialect}.td) + mlir_tablegen(${dialect}.h.inc -gen-op-decls) + mlir_tablegen(${dialect}.cpp.inc -gen-op-defs) + add_public_tablegen_target(MLIR${dialect}IncGen) + + # Generate Dialect Documentation + set(LLVM_TARGET_DEFINITIONS ${dialect_doc_filename}.td) + tablegen(MLIR ${dialect_doc_filename}.md -gen-op-doc "-I${MLIR_MAIN_SRC_DIR}" "-I${MLIR_INCLUDE_DIR}") + set(GEN_DOC_FILE ${MLIR_BINARY_DIR}/docs/Dialects/${dialect_doc_filename}.md) + add_custom_command( + OUTPUT ${GEN_DOC_FILE} + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_BINARY_DIR}/${dialect_doc_filename}.md + ${GEN_DOC_FILE} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${dialect_doc_filename}.md) + add_custom_target(${dialect_doc_filename}DocGen DEPENDS ${GEN_DOC_FILE}) + add_dependencies(mlir-doc ${dialect_doc_filename}DocGen) +endfunction() + +add_custom_target(mlir-doc) + +# TODO: This is to 
handle the current static registration, but should be +# factored out a bit. +function(whole_archive_link target) + if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") + set(link_flags "-L${CMAKE_BINARY_DIR}/lib ") + FOREACH(LIB ${ARGN}) + string(CONCAT link_flags ${link_flags} "-Wl,-force_load ${CMAKE_BINARY_DIR}/lib/lib${LIB}.a ") + ENDFOREACH(LIB) + elseif(MSVC) + FOREACH(LIB ${ARGN}) + string(CONCAT link_flags ${link_flags} "/WHOLEARCHIVE:${LIB} ") + ENDFOREACH(LIB) + else() + set(link_flags "-L${CMAKE_BINARY_DIR}/lib -Wl,--whole-archive,") + FOREACH(LIB ${ARGN}) + string(CONCAT link_flags ${link_flags} "-l${LIB},") + ENDFOREACH(LIB) + string(CONCAT link_flags ${link_flags} "--no-whole-archive") + endif() + set_target_properties(${target} PROPERTIES LINK_FLAGS ${link_flags}) +endfunction(whole_archive_link) + +# Build the CUDA conversions and run according tests if the NVPTX backend +# is available +if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + set(MLIR_CUDA_CONVERSIONS_ENABLED 1) +else() + set(MLIR_CUDA_CONVERSIONS_ENABLED 0) +endif() + +set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner") + +include_directories( "include") +include_directories( ${MLIR_INCLUDE_DIR}) + +add_subdirectory(include/mlir) +add_subdirectory(lib) +add_subdirectory(tools) +add_subdirectory(unittests) +add_subdirectory(test) + +if( LLVM_INCLUDE_EXAMPLES ) + add_subdirectory(examples) +endif() + +if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) + install(DIRECTORY include/mlir include/mlir-c + DESTINATION include + COMPONENT mlir-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.inc" + PATTERN "LICENSE.TXT" + ) + + install(DIRECTORY ${MLIR_INCLUDE_DIR}/mlir ${MLIR_INCLUDE_DIR}/mlir-c + DESTINATION include + COMPONENT mlir-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.gen" + PATTERN "*.inc" + PATTERN "CMakeFiles" EXCLUDE + PATTERN "config.h" EXCLUDE + ) + + if (NOT LLVM_ENABLE_IDE) + add_llvm_install_targets(install-mlir-headers + DEPENDS mlir-headers + COMPONENT 
See [https://mlir.llvm.org/](https://mlir.llvm.org/) for more information.
* Move constant operands of commutative binary operators to the right side -
  e.g. "(addi 4, x)" to "(addi x, 4)".
diff --git a/mlir/docs/ConversionToLLVMDialect.md b/mlir/docs/ConversionToLLVMDialect.md new file mode 100644 index 0000000000000000000000000000000000000000..19403e27dc4ce549d3dfc4f63312a78420659db2 --- /dev/null +++ b/mlir/docs/ConversionToLLVMDialect.md @@ -0,0 +1,443 @@ +# Conversion to the LLVM Dialect + +Conversion from the Standard to the [LLVM Dialect](Dialects/LLVM.md) can be +performed by the specialized dialect conversion pass by running + +```sh +mlir-opt -convert-std-to-llvm +``` + +It performs type and operation conversions for a subset of operations from +standard dialect (operations on scalars and vectors, control flow operations) as +described in this document. We use the terminology defined by the +[LLVM IR Dialect description](Dialects/LLVM.md) throughout this document. + +[TOC] + +## Type Conversion + +### Scalar Types + +Scalar types are converted to their LLVM counterparts if they exist. The +following conversions are currently implemented. + +- `i*` converts to `!llvm.i*` +- `f16` converts to `!llvm.half` +- `f32` converts to `!llvm.float` +- `f64` converts to `!llvm.double` + +Note: `bf16` type is not supported by LLVM IR and cannot be converted. + +### Index Type + +Index type is converted to a wrapped LLVM IR integer with bitwidth equal to the +bitwidth of the pointer size as specified by the +[data layout](https://llvm.org/docs/LangRef.html#data-layout) of the LLVM module +[contained](Dialects/LLVM.md#context-and-module-association) in the LLVM Dialect +object. For example, on x86-64 CPUs it converts to `!llvm.i64`. + +### Vector Types + +LLVM IR only supports *one-dimensional* vectors, unlike MLIR where vectors can +be multi-dimensional. Vector types cannot be nested in either IR. In the +one-dimensional case, MLIR vectors are converted to LLVM IR vectors of the same +size with element type converted using these conversion rules. 
In the +n-dimensional case, MLIR vectors are converted to (n-1)-dimensional array types +of one-dimensional vectors. + +For example, `vector<4 x f32>` converts to `!llvm<"<4 x float>">` and `vector<4 +x 8 x 16 x f32>` converts to `!llvm<"[4 x [8 x <16 x float>]]">`. + +### Memref Types + +Memref types in MLIR have both static and dynamic information associated with +them. The dynamic information comprises the buffer pointer as well as sizes and +strides of any dynamically sized dimensions. Memref types are normalized and +converted to a descriptor that is only dependent on the rank of the memref. The +descriptor contains: + +1. the pointer to the data buffer, followed by +2. the pointer to properly aligned data payload that the memref indexes, + followed by +3. a lowered `index`-type integer containing the distance between the beginning + of the buffer and the first element to be accessed through the memref, + followed by +4. an array containing as many `index`-type integers as the rank of the memref: + the array represents the size, in number of elements, of the memref along + the given dimension. For constant MemRef dimensions, the corresponding size + entry is a constant whose runtime value must match the static value, + followed by +5. a second array containing as many 64-bit integers as the rank of the MemRef: + the second array represents the "stride" (in tensor abstraction sense), i.e. + the number of consecutive elements of the underlying buffer. + +For constant memref dimensions, the corresponding size entry is a constant whose +runtime value matches the static value. This normalization serves as an ABI for +the memref type to interoperate with externally linked functions. In the +particular case of rank `0` memrefs, the size and stride arrays are omitted, +resulting in a struct containing two pointers + offset. 
memref<f32> -> !llvm<"{ float*, float*, i64 }">
memref<1 x f32> -> !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
memref<? x f32> -> !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// binary function with two results
(i32, f32) -> (i64, f64)
// has its result aggregated into a structure type
!llvm<"{i64, double} (i32, float)">
  // call and extract the values from the structure
  %2 = llvm.call @foo(%0, %1) : (!llvm.i32, !llvm.i64) -> !llvm<"{i32, i64}">
that the storage remains live until the callee returns. The caller can then pass
the pointer to that memory as a function argument. The callee loads from the
pointers it was passed as arguments in the entry block of the function, making
the descriptor passed in as argument available for use similarly to
locally-defined descriptors.
+ %15 = llvm.mlir.constant(1 : index) : !llvm.i64 + %16 = llvm.alloca %15 x !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }"> + : (!llvm.i64) -> !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }*"> + llvm.store %14, %16 : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }*"> + // Pass the pointer to the function. + llvm.call @foo(%16) : (!llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }*">) -> () + llvm.return +} +``` + +*This convention may or may not apply if the conversion of MemRef types is +overridden by the user.* + +## Repeated Successor Removal + +Since the goal of the LLVM IR dialect is to reflect LLVM IR in MLIR, the dialect +and the conversion procedure must account for the differences between block +arguments and LLVM IR PHI nodes. In particular, LLVM IR disallows PHI nodes with +different values coming from the same source. Therefore, the LLVM IR dialect +disallows operations that have identical successors accepting arguments, which +would lead to invalid PHI nodes. The conversion process resolves the potential +PHI source ambiguity by injecting dummy blocks if the same block is used more +than once as a successor in an instruction. These dummy blocks branch +unconditionally to the original successors, pass them the original operands +(available in the dummy block because it is dominated by the original block) and +are used instead of them in the original terminator operation. 
  cond_br %0, ^bb1(%1 : i32), ^dummy
^bb1(%3 : i32):
  "use"(%3) : (i32) -> ()
^dummy:
  br ^bb1(%2 : i32)
// Compute the linearized index from strides. Each block below extracts one
// stride from the descriptor, multiplies it with the index and accumulates
// the total offset.
+%0 = llvm.load %ptr : !llvm<"float*"> +``` + +For stores, the address computation code is identical and only the actual store +operation is different. + +Note: the conversion does not perform any sort of common subexpression +elimination when emitting memref accesses. diff --git a/mlir/docs/DeclarativeRewrites.md b/mlir/docs/DeclarativeRewrites.md new file mode 100644 index 0000000000000000000000000000000000000000..67ff102fef968bc0eb8bedbb2082c0e62f4808ca --- /dev/null +++ b/mlir/docs/DeclarativeRewrites.md @@ -0,0 +1,690 @@ +# Table-driven Declarative Rewrite Rule (DRR) + +In addition to subclassing the `mlir::RewritePattern` C++ class, MLIR also +supports defining rewrite rules in a declarative manner. Similar to +[Op Definition Specification](OpDefinitions.md) (ODS), this is achieved via +[TableGen][TableGen], which is a language to maintain records of domain-specific +information. The rewrite rules are specified concisely in a TableGen record, +which will be expanded into an equivalent `mlir::RewritePattern` subclass at +compiler build time. + +This manual explains in detail all of the available mechanisms for defining +rewrite rules in such a declarative manner. It aims to be a specification +instead of a tutorial. Please refer to +[Quickstart tutorial to adding MLIR graph rewrite](QuickstartRewrites.md) for +the latter. + +Given that declarative rewrite rules depend on op definition specification, this +manual assumes knowledge of the [ODS](OpDefinitions.md) doc. + +## Benefits + +Compared to the hand-written C++ classes, this declarative approach has several +benefits, including but not limited to: + +* **Being declarative**: The pattern creator just needs to state the rewrite + pattern declaratively, without worrying about the concrete C++ methods to + call. +* **Removing boilerplate and showing the very essence of the rewrite**: + `mlir::RewritePattern` is already good at hiding boilerplate for defining a + rewrite rule. 
class Pattern<
    dag sourcePattern, list<dag> resultPatterns,
    list<dag> additionalConstraints = [],
    dag benefitsAdded = (addBenefit 0)>;
class Pat<
    dag sourcePattern, dag resultPattern,
    list<dag> additionalConstraints = [],
    dag benefitsAdded = (addBenefit 0)> :
  Pattern<sourcePattern, [resultPattern], additionalConstraints, benefitsAdded>;
+ +#### Binding op arguments and limiting the match + +For example, + +```tblgen +def AOp : Op<"a_op"> { + let arguments = (ins + AnyType:$a_input, + AnyAttr:$a_attr + ); + + let results = (outs + AnyType:$a_output + ); +} + +def : Pat<(AOp $input, F32Attr:$attr), ...>; +``` + +In the above, we are matching an `AOp` whose `$input` can be anything valid as +defined by the op and whose `$attr` must be a float attribute. If the match +succeeds, we bind the `$input` symbol to the op's only input (`$a_input`) and +`$attr` to the only attribute (`$a_attr`); we can reference them using `$input` +and `$attr` in result patterns and additional constraints. + +The pattern is position-based: the symbol names used for capturing here do not +need to match with the op definition as shown in the above example. As another +example, the pattern can be written as ` def : Pat<(AOp $a, F32Attr:$b), ...>;` +and use `$a` and `$b` to refer to the captured input and attribute. But using +the ODS name directly in the pattern is also allowed. + +Also note that we only need to add `TypeConstraint` or `AttributeConstraint` +when we need to further limit the match criteria. If all valid cases to the op +are acceptable, then we can leave the constraint unspecified. + +`$_` is a special symbol to mean ignore capturing an argument. For example, +`def : Pat<(AOp $_, $b), ...>` means only `$b` is interesting to capture and +will be referenced later in result patterns. It's still possible to place +additional constraints even if the symbol is not to be captured; for such case, +you can simply use just the `TypeConstraint` or `AttributeConstraint` without a +bound symbol, for example, `def : Pat<(AOp $a, F32Attr), ...>`. 
To match a DAG of ops, use nested `dag` objects:
One of them has aggregated
+parameters for result types, operands, and attributes in the signature: `void
+COp::build(..., ArrayRef<Type> resultTypes, ArrayRef<Value> operands,
+ArrayRef<NamedAttribute> attr)`. The pattern in the above calls this `build()`
+method for constructing the `COp`.
+
+In general, arguments in the result pattern will be passed directly to the
+`build()` method. To leverage the auto-generated `build()` method, list them in
+the pattern by following the exact same order as the ODS `arguments` definition.
+Otherwise, a custom `build()` method that matches the argument list is required.
+
+Right now all ODS-generated `build()` methods require specifying the result
+type(s), unless the op has known traits like `SameOperandsAndResultType` that
+we can use to auto-generate a `build()` method with result type deduction.
+When generating an op to replace the result of the matched root op, we can use
+the matched root op's result type when calling the ODS-generated builder.
+Otherwise (e.g., generating an [auxiliary op](#supporting-auxiliary-ops) or
+generating an op with a nested result pattern), DRR will not be able to deduce
+the result type(s). The pattern author will need to define a custom builder
+that has result type deduction ability via `OpBuilder` in ODS. For example,
+in the following pattern
+
+```tblgen
+def : Pat<(AOp $input, $attr), (COp (AOp $input, $attr) $attr)>;
+```
+
+`AOp` is generated via a nested result pattern; DRR won't be able to deduce the
+result type for it. A custom builder for `AOp` should be defined and it should
+deduce the result type by itself. The builder should have a separate parameter
+for each operand and attribute and deduce the result type internally by itself.
+For example, for the above `AOp`, a possible builder is: + +```c++ + +void AOp::build(Builder *builder, OperationState &state, + Value input, Attribute attr) { + state.addOperands({input}); + state.addAttribute("a_attr", attr); + Type type = ...; // Deduce result type here + state.addTypes({type}); +} +``` + +Failing to define such a builder will result in an error at C++ compilation time +saying the call to `AOp::build()` cannot be resolved because of the number of +parameters mismatch. + +#### Generating DAG of operations + +`dag` objects can be nested to generate a DAG of operations: + +```tblgen +def : Pat<(AOp $input, $attr), (COp (BOp), $attr)>; +``` + +In the above, we generate a `BOp`, and then use its result to generate the `COp` +to replace the matched `AOp`. + +#### Binding op results + +In the result pattern, we can bind to the result(s) of a newly built op by +attaching symbols to the op. (But we **cannot** bind to op arguments given that +they are referencing previously bound symbols.) This is useful for reusing +newly created results where suitable. For example, + +```tblgen +def DOp : Op<"d_op"> { + let arguments = (ins + AnyType:$d_input1, + AnyType:$d_input2, + ); + + let results = (outs + AnyType:$d_output + ); +} + +def : Pat<(AOp $input, $ignored_attr), (DOp (BOp:$b_result) $b_result)>; +``` + +In this pattern, an `AOp` is matched and replaced with a `DOp` whose two +operands are from the result of a single `BOp`. This is only possible by binding +the result of the `BOp` to a name and reuse it for the second operand of the +`DOp` + +#### `NativeCodeCall`: transforming the generated op + +Sometimes the captured arguments are not exactly what we want so they cannot be +directly fed in as arguments to build the new op. For such cases, we can apply +transformations on the arguments by calling into C++ helper functions. This is +achieved by `NativeCodeCall`. 
+ +For example, if we want to capture some op's attributes and group them as an +array attribute to construct a new op: + +```tblgen + +def TwoAttrOp : Op<"two_attr_op"> { + let arguments = (ins + AnyAttr:$op_attr1, + AnyAttr:$op_attr2 + ); + + let results = (outs + AnyType:$op_output + ); +} + +def OneAttrOp : Op<"one_attr_op"> { + let arguments = (ins + ArrayAttr:$op_attr + ); + + let results = (outs + AnyType:$op_output + ); +} +``` + +We can write a C++ helper function: + +```c++ +Attribute createArrayAttr(Builder &builder, Attribute a, Attribute b) { + return builder.getArrayAttr({a, b}); +} +``` + +And then write the pattern as: + +```tblgen +def createArrayAttr : NativeCodeCall<"createArrayAttr($_builder, $0, $1)">; + +def : Pat<(TwoAttrOp $attr1, $attr2), + (OneAttrOp (createArrayAttr $attr1, $attr2))>; +``` + +And make sure the generated C++ code from the above pattern has access to the +definition of the C++ helper function. + +In the above example, we are using a string to specialize the `NativeCodeCall` +template. The string can be an arbitrary C++ expression that evaluates into +some C++ object expected at the `NativeCodeCall` site (here it would be +expecting an array attribute). Typically the string should be a function call. + +Note that currently `NativeCodeCall` must return no more than one value or +attribute. This might change in the future. + +##### `NativeCodeCall` placeholders + +In `NativeCodeCall`, we can use placeholders like `$_builder`, `$N`. The former +is called _special placeholder_, while the latter is called _positional +placeholder_. + +`NativeCodeCall` right now only supports two special placeholders: `$_builder` +and `$_self`: + +* `$_builder` will be replaced by the current `mlir::PatternRewriter`. +* `$_self` will be replaced with the entity `NativeCodeCall` is attached to. 
+
+We have seen how `$_builder` can be used in the above; it allows us to pass a
+`mlir::Builder` (`mlir::PatternRewriter` is a subclass of `mlir::OpBuilder`,
+which is a subclass of `mlir::Builder`) to the C++ helper function to use the
+handy methods on `mlir::Builder`.
+
+`$_self` is useful when we want to write something in the form of
+`NativeCodeCall<"...">:$symbol`. For example, if we want to reverse the previous
+example and decompose the array attribute into two attributes:
+
+```tblgen
+class getNthAttr<int n> : NativeCodeCall<"$_self.getValue()[" # n # "]">;
+
+def : Pat<(OneAttrOp $attr),
+          (TwoAttrOp (getNthAttr<0>:$attr), (getNthAttr<1>:$attr))>;
+```
+
+In the above, `$_self` is substituted by the attribute bound by `$attr`, which
+is `OneAttrOp`'s array attribute.
+
+Positional placeholders will be substituted by the `dag` object parameters at
+the `NativeCodeCall` use site. For example, if we define `SomeCall :
+NativeCodeCall<"someFn($1, $2, $0)">` and use it like `(SomeCall $in0, $in1,
+$in2)`, then this will be translated into C++ call `someFn($in1, $in2, $in0)`.
+
+##### Customizing entire op building
+
+`NativeCodeCall` is not only limited to transforming arguments for building an
+op; it can be also used to specify how to build an op entirely. An example:
+
+If we have a C++ function for building an op:
+
+```c++
+Operation *createMyOp(OpBuilder builder, Value input, Attribute attr);
+```
+
+We can wrap it up and invoke it like:
+
+```tblgen
+def createMyOp : NativeCodeCall<"createMyOp($_builder, $0, $1)">;
+
+def : Pat<(... $input, $attr), (createMyOp $input, $attr)>;
+```
+
+### Supporting auxiliary ops
+
+A declarative rewrite rule supports multiple result patterns. One of the
+purposes is to allow generating _auxiliary ops_. Auxiliary ops are operations
+used for building the replacement ops; but they are not directly used for
+replacement themselves.
+
+For the case of uni-result ops, if there are multiple result patterns, only the
+value generated from the last result pattern will be used to replace the matched
+root op's result; all other result patterns will be considered as generating
+auxiliary ops.
+
+Normally we want to specify ops as nested `dag` objects if their def-use
+relationship can be expressed in the way that an op's result can feed as the
+argument to consuming op. But that is not always possible. For example, if we
+want to allocate memory and store some computation (in pseudocode):
+
+```mlir
+%dst = addi %lhs, %rhs
+```
+
+into
+
+```mlir
+%shape = shape %lhs
+%mem = alloc %shape
+%sum = addi %lhs, %rhs
+store %mem, %sum
+%dst = load %mem
+```
+
+We cannot fit in with just one result pattern given `store` does not return a
+value. Instead we can use multiple result patterns:
+
+```tblgen
+def : Pattern<(AddIOp $lhs, $rhs),
+              [(StoreOp (AllocOp:$mem (ShapeOp $lhs)), (AddIOp $lhs, $rhs)),
+               (LoadOp $mem)]>;
+```
+
+In the above we use the first result pattern to generate the first four ops, and
+use the last pattern to generate the last op, which is used to replace the
+matched op.
+
+### Supporting multi-result ops
+
+Multi-result ops bring extra complexity to declarative rewrite rules. We use
+TableGen `dag` objects to represent ops in patterns; there is no native way to
+indicate that an op generates multiple results. The approach adopted is based
+on **naming convention**: a `__N` suffix is added to a symbol to indicate the
+`N`-th result.
+
+#### `__N` suffix
+
+The `__N` suffix is specifying the `N`-th result as a whole (which can be
+[variadic](#supporting-variadic-ops)).
For example, we can bind a symbol to some +multi-result op and reference a specific result later: + +```tblgen +def ThreeResultOp : Op<"three_result_op"> { + let arguments = (ins ...); + + let results = (outs + AnyTensor:$op_output1, + AnyTensor:$op_output2, + AnyTensor:$op_output3 + ); +} + +def : Pattern<(ThreeResultOp:$results ...), + [(... $results__0), ..., (... $results__2), ...]>; +``` + +In the above pattern we bind `$results` to all the results generated by +`ThreeResultOp` and references its `$input1` and `$input3` later in the result +patterns. + +We can also bind a symbol and reference one of its specific result at the same +time, which is typically useful when generating multi-result ops: + +```tblgen +// TwoResultOp has similar definition as ThreeResultOp, but only has two +// results. + +def : Pattern<(TwoResultOp ...), + [(ThreeResultOp:$results__2, ...), + (replaceWithValue $results__0)]>; +``` + +In the above, we created a `ThreeResultOp` and bind `results` to its results, +and uses its last result (`$output3`) and first result (`$output1`) to replace +the `TwoResultOp`'s two results, respectively. + +#### Replacing multi-result ops + +The above example also shows how to replace a matched multi-result op. + +To replace a `N`-result op, the result patterns must generate at least `N` +declared values (see [Declared vs. actual value](#declared-vs-actual-value) for +definition). If there are more than `N` declared values generated, only the +last `N` declared values will be used to replace the matched op. Note that +because of the existence of multi-result op, one result pattern **may** generate +multiple declared values. So it means we do not necessarily need `N` result +patterns to replace an `N`-result op. For example, to replace an op with three +results, you can have + +```tblgen +// ThreeResultOp/TwoResultOp/OneResultOp generates three/two/one result(s), +// respectively. + +// Replace each result with a result generated from an individual op. 
+
+def : Pattern<(ThreeResultOp ...),
+              [(OneResultOp ...), (OneResultOp ...), (OneResultOp ...)]>;
+
+// Replace the first two results with two results generated from the same op.
+def : Pattern<(ThreeResultOp ...),
+              [(TwoResultOp ...), (OneResultOp ...)]>;
+
+// Replace all three results with three results generated from the same op.
+def : Pat<(ThreeResultOp ...), (ThreeResultOp ...)>;
+
+// Generate an auxiliary op first; the last op generates the three results
+// used for replacement.
+def : Pattern<(ThreeResultOp ...),
+              [(AuxiliaryOp ...), (ThreeResultOp ...)]>;
+```
+
+But using a single op to serve as both auxiliary op and replacement op is
+forbidden, i.e., the following is not allowed because the first
+`TwoResultOp` generates two results but only the second result is used for
+replacing the matched op's result:
+
+```tblgen
+def : Pattern<(ThreeResultOp ...),
+              [(TwoResultOp ...), (TwoResultOp ...)]>;
+```
+
+### Supporting variadic ops
+
+#### Declared vs. actual value
+
+Before going into details on variadic op support, we need to define a few terms
+regarding an op's values.
+
+* _Value_: either an operand or a result
+* _Declared operand/result/value_: an operand/result/value statically declared
+  in ODS of the op
+* _Actual operand/result/value_: an operand/result/value of an op instance at
+  runtime
+
+The above terms are needed because ops can have multiple results, and some of the
+results can also be variadic. For example,
+
+```tblgen
+def MultiVariadicOp : Op<"multi_variadic_op"> {
+  let arguments = (ins
+    AnyTensor:$input1,
+    Variadic<AnyTensor>:$input2,
+    AnyTensor:$input3
+  );
+
+  let results = (outs
+    AnyTensor:$output1,
+    Variadic<AnyTensor>:$output2,
+    AnyTensor:$output3
+  );
+}
+```
+
+We say the above op has 3 declared operands and 3 declared results. But at
+runtime, an instance can have 3 values corresponding to `$input2` and 2 values
+correspond to `$output2`; we say it has 5 actual operands and 4 actual
+results. A variadic operand/result is considered as a declared value that can
+correspond to multiple actual values.
+
+[TODO]
+
+### Supplying additional constraints
+
+Constraints can be placed on op arguments when matching. But sometimes we need
+to also place constraints on the matched op's results or sometimes need to limit
+the matching with some constraints that cover both the arguments and the
+results. The third parameter to `Pattern` (and `Pat`) is for this purpose.
+
+For example, we can write
+
+```tblgen
+def HasNoUseOf: Constraint<
+    CPred<"$_self->use_begin() == $_self->use_end()">, "has no use">;
+
+def HasSameElementType : Constraint<
+    CPred<"$0.cast<ShapedType>().getElementType() == "
+          "$1.cast<ShapedType>().getElementType()">,
+    "has same element type">;
+
+def : Pattern<(TwoResultOp:$results $input),
+              [(...), (...)],
+              [(F32Tensor:$results__0), (HasNoUseOf:$results__1),
+               (HasSameElementType $results__0, $input)]>;
+```
+
+You can
+
+* Use normal `TypeConstraint`s on previous bound symbols (the first result of
+  `TwoResultOp` must be a float tensor);
+* Define new `Constraint` for previous bound symbols (the second result of
+  `TwoResultOp` must have no use);
+* Apply constraints on multiple bound symbols (`$input` and `TwoResultOp`'s
+  first result must have the same element type).
+
+### Adjusting benefits
+
+The benefit of a `Pattern` is an integer value indicating the benefit of matching
+the pattern. It determines the priorities of patterns inside the pattern rewrite
+driver. A pattern with a higher benefit is applied before one with a lower
+benefit.
+
+In DRR, a rule is set to have a benefit of the number of ops in the source
+pattern. This is based on the heuristics and assumptions that:
+
+* Larger matches are more beneficial than smaller ones.
+* If a smaller one is applied first the larger one may not apply anymore.
+
+
+The fourth parameter to `Pattern` (and `Pat`) allows manually tweaking a
+pattern's benefit. Just supply `(addBenefit N)` to add `N` to the benefit value.
+ +## Special directives + +[TODO] + +## Debugging Tips + +### Run `mlir-tblgen` to see the generated content + +TableGen syntax sometimes can be obscure; reading the generated content can be +a very helpful way to understand and debug issues. To build `mlir-tblgen`, run +`cmake --build . --target mlir-tblgen` in your build directory and find the +`mlir-tblgen` binary in the `bin/` subdirectory. All the supported generators +can be found via `mlir-tblgen --help`. + +To see the generated code, invoke `mlir-tblgen` with a specific generator by +providing include paths via `-I`. For example, + +```sh +# To see all the C++ pattern rewrite classes +mlir-tblgen --gen-rewriters -I /path/to/mlir/include /path/to/input/td/file +``` + +### Compilation error: no matching member function for call to 'build' + +This is because DRR is failing to call a `build()` method with result type +deduction ability. See [building operations](#building-operations) for more +details. + +[TableGen]: https://llvm.org/docs/TableGen/index.html +[OpBase]: https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/OpBase.td diff --git a/mlir/docs/DefiningAttributesAndTypes.md b/mlir/docs/DefiningAttributesAndTypes.md new file mode 100644 index 0000000000000000000000000000000000000000..60243e5fd57fc8937ed5f6db51fa243dc82eb06a --- /dev/null +++ b/mlir/docs/DefiningAttributesAndTypes.md @@ -0,0 +1,282 @@ +# Quickstart tutorial to defining custom dialect attributes and types + +This document is a quickstart to defining dialect specific extensions to the +[attribute](LangRef.md#attributes) and [type system](LangRef.md#type-system). +The main part of the tutorial focuses on defining types, but the instructions +are nearly identical for defining attributes. + +See [MLIR specification](LangRef.md) for more information about MLIR, the +structure of the IR, operations, etc. + +## Types + +Types in MLIR (like attributes, locations, and many other things) are +value-typed. 
This means that instances of `Type` should be passed around
+by-value, as opposed to by-pointer or by-reference. The `Type` class in itself
+acts as a wrapper around an internal storage object that is uniqued within an
+instance of an `MLIRContext`.
+
+### Reserving a range of type kinds
+
+Types in MLIR rely on having a unique `kind` value to ensure that casting checks
+remain extremely efficient
+([rationale](Rationale.md#reserving-dialect-type-kinds)). For a dialect
+author, this means that a range of type `kind` values must be explicitly, and
+statically, reserved. A dialect can reserve a range of values by adding a new
+entry to the
+[DialectSymbolRegistry](https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/DialectSymbolRegistry.def).
+To support out-of-tree and experimental dialects, the registry predefines a set
+of private ranges, `PRIVATE_EXPERIMENTAL_[0-9]`, that are free for immediate
+use.
+
+```c++
+DEFINE_SYM_KIND_RANGE(LINALG) // Linear Algebra Dialect
+DEFINE_SYM_KIND_RANGE(TOY) // Toy language (tutorial) Dialect
+
+// The following ranges are reserved for experimenting with MLIR dialects in a
+// private context without having to register them here.
+DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_0)
+```
+
+For the sake of this tutorial, we will use the predefined
+`PRIVATE_EXPERIMENTAL_0` range. These definitions will provide a range in the
+`Type::Kind` enum to use when defining the derived types.
+
+```c++
+namespace MyTypes {
+enum Kinds {
+  // These kinds will be used in the examples below.
+  Simple = Type::Kind::FIRST_PRIVATE_EXPERIMENTAL_0_TYPE,
+  Complex
+};
+}
+```
+
+### Defining the type class
+
+As described above, `Type` objects in MLIR are value-typed and rely on having an
+implicit internal storage object that holds the actual data for the type. When
+defining a new `Type` it isn't always necessary to define a new storage class.
+
+So before defining the derived `Type`, it's important to know which of the two
+classes of `Type` we are defining. Some types are `primitives`, meaning they do
+not have any parameters and are singletons uniqued by kind, like the
+[`index` type](LangRef.md#index-type). Parametric types, on the other hand, have
+additional information that differentiates different instances of the same
+`Type` kind. For example the [`integer` type](LangRef.md#integer-type) has a
+bitwidth, making `i8` and `i16` different instances of
+[`integer` type](LangRef.md#integer-type).
+
+#### Simple non-parametric types
+
+For simple parameterless types, we can jump straight into defining the derived
+type class. Given that these types are uniqued solely on `kind`, we don't need
+to provide our own storage class.
+
+```c++
+/// This class defines a simple parameterless type. All derived types must
+/// inherit from the CRTP class 'Type::TypeBase'. It takes as template
+/// parameters the concrete type (SimpleType), and the base class to use (Type).
+/// 'Type::TypeBase' also provides several utility methods to simplify type
+/// construction.
+class SimpleType : public Type::TypeBase<SimpleType, Type> {
+public:
+  /// Inherit some necessary constructors from 'TypeBase'.
+  using Base::Base;
+
+  /// This static method is used to support type inquiry through isa, cast,
+  /// and dyn_cast.
+  static bool kindof(unsigned kind) { return kind == MyTypes::Simple; }
+
+  /// This method is used to get an instance of the 'SimpleType'. Given that
+  /// this is a parameterless type, it just needs to take the context for
+  /// uniquing purposes.
+  static SimpleType get(MLIRContext *context) {
+    // Call into a helper 'get' method in 'TypeBase' to get a uniqued instance
+    // of this type.
+    return Base::get(context, MyTypes::Simple);
+  }
+};
+```
+
+#### Parametric types
+
+Parametric types are those that have additional construction or uniquing
+constraints outside of the type `kind`.
As such, these types require defining a
+type storage class.
+
+##### Defining a type storage
+
+Type storage objects contain all of the data necessary to construct and unique a
+parametric type instance. The storage classes must obey the following:
+
+* Inherit from the base type storage class `TypeStorage`.
+* Define a type alias, `KeyTy`, that maps to a type that uniquely identifies
+  an instance of the parent type.
+* Provide a construction method that is used to allocate a new instance of the
+  storage class.
+  - `Storage *construct(TypeStorageAllocator &, const KeyTy &key)`
+* Provide a comparison method between the storage and `KeyTy`.
+  - `bool operator==(const KeyTy &) const`
+* Provide a method to generate the `KeyTy` from a list of arguments passed to
+  the uniquer. (Note: This is only necessary if the `KeyTy` cannot be default
+  constructed from these arguments).
+  - `static KeyTy getKey(Args...&& args)`
+* Provide a method to hash an instance of the `KeyTy`. (Note: This is not
+  necessary if an `llvm::DenseMapInfo` specialization exists)
+  - `static llvm::hash_code hashKey(const KeyTy &)`
+
+Let's look at an example:
+
+```c++
+/// Here we define a storage class for a ComplexType, that holds a non-zero
+/// integer and an integer type.
+struct ComplexTypeStorage : public TypeStorage {
+  ComplexTypeStorage(unsigned nonZeroParam, Type integerType)
+      : nonZeroParam(nonZeroParam), integerType(integerType) {}
+
+  /// The hash key for this storage is a pair of the integer and type params.
+  using KeyTy = std::pair<unsigned, Type>;
+
+  /// Define the comparison function for the key type.
+  bool operator==(const KeyTy &key) const {
+    return key == KeyTy(nonZeroParam, integerType);
+  }
+
+  /// Define a hash function for the key type.
+  /// Note: This isn't necessary because std::pair, unsigned, and Type all have
+  /// hash functions already available.
+  static llvm::hash_code hashKey(const KeyTy &key) {
+    return llvm::hash_combine(key.first, key.second);
+  }
+
+  /// Define a construction function for the key type.
+  /// Note: This isn't necessary because KeyTy can be directly constructed with
+  /// the given parameters.
+  static KeyTy getKey(unsigned nonZeroParam, Type integerType) {
+    return KeyTy(nonZeroParam, integerType);
+  }
+
+  /// Define a construction method for creating a new instance of this storage.
+  static ComplexTypeStorage *construct(TypeStorageAllocator &allocator,
+                                       const KeyTy &key) {
+    return new (allocator.allocate<ComplexTypeStorage>())
+        ComplexTypeStorage(key.first, key.second);
+  }
+
+  unsigned nonZeroParam;
+  Type integerType;
+};
+```
+
+##### Type class definition
+
+Now that the storage class has been created, the derived type class can be
+defined. This structure is similar to the
+[simple type](#simple-non-parametric-types), except that a bit more of the
+functionality of `Type::TypeBase` is put to use.
+
+```c++
+/// This class defines a parametric type. All derived types must inherit from
+/// the CRTP class 'Type::TypeBase'. It takes as template parameters the
+/// concrete type (ComplexType), the base class to use (Type), and the storage
+/// class (ComplexTypeStorage). 'Type::TypeBase' also provides several utility
+/// methods to simplify type construction and verification.
+class ComplexType
+    : public Type::TypeBase<ComplexType, Type, ComplexTypeStorage> {
+public:
+  /// Inherit some necessary constructors from 'TypeBase'.
+  using Base::Base;
+
+  /// This static method is used to support type inquiry through isa, cast,
+  /// and dyn_cast.
+  static bool kindof(unsigned kind) { return kind == MyTypes::Complex; }
+
+  /// This method is used to get an instance of the 'ComplexType'. This method
+  /// asserts that all of the construction invariants were satisfied. To
+  /// gracefully handle failed construction, getChecked should be used instead.
+  static ComplexType get(MLIRContext *context, unsigned param, Type type) {
+    // Call into a helper 'get' method in 'TypeBase' to get a uniqued instance
+    // of this type. All parameters to the storage class are passed after the
+    // type kind.
+    return Base::get(context, MyTypes::Complex, param, type);
+  }
+
+  /// This method is used to get an instance of the 'ComplexType', defined at
+  /// the given location. If any of the construction invariants are invalid,
+  /// errors are emitted with the provided location and a null type is returned.
+  /// Note: This method is completely optional.
+  static ComplexType getChecked(MLIRContext *context, unsigned param, Type type,
+                                Location location) {
+    // Call into a helper 'getChecked' method in 'TypeBase' to get a uniqued
+    // instance of this type. All parameters to the storage class are passed
+    // after the type kind.
+    return Base::getChecked(location, context, MyTypes::Complex, param, type);
+  }
+
+  /// This method is used to verify the construction invariants passed into the
+  /// 'get' and 'getChecked' methods. Note: This method is completely optional.
+  static LogicalResult verifyConstructionInvariants(
+      llvm::Optional<Location> loc, MLIRContext *context, unsigned param,
+      Type type) {
+    // Our type only allows non-zero parameters.
+    if (param == 0) {
+      if (loc)
+        context->emitError(*loc) << "non-zero parameter passed to 'ComplexType'";
+      return failure();
+    }
+    // Our type also expects an integer type.
+    if (!type.isa<IntegerType>()) {
+      if (loc)
+        context->emitError(*loc) << "non integer-type passed to 'ComplexType'";
+      return failure();
+    }
+    return success();
+  }
+
+  /// Return the parameter value.
+  unsigned getParameter() {
+    // 'getImpl' returns a pointer to our internal storage instance.
+    return getImpl()->nonZeroParam;
+  }
+
+  /// Return the integer parameter type.
+  IntegerType getParameterType() {
+    // 'getImpl' returns a pointer to our internal storage instance.
+    return getImpl()->integerType;
+  }
+};
+```
+
+### Registering types with a Dialect
+
+Once the dialect types have been defined, they must then be registered with a
+`Dialect`. This is done via a similar mechanism to
+[operations](LangRef.md#operations), `addTypes`.
+
+```c++
+struct MyDialect : public Dialect {
+  MyDialect(MLIRContext *context) : Dialect(/*name=*/"mydialect", context) {
+    /// Add these types to the dialect.
+    addTypes<SimpleType, ComplexType>();
+  }
+};
+```
+
+### Parsing and Printing
+
+As a final step after registration, a dialect must override the `printType` and
+`parseType` hooks. These enable native support for roundtripping the type in the
+textual IR.
+
+## Attributes
+
+As stated in the introduction, the process for defining dialect attributes is
+nearly identical to that of defining dialect types. The key difference is that
+the things named `*Type` are generally now named `*Attr`.
+
+* `Type::TypeBase` -> `Attribute::AttrBase`
+* `TypeStorageAllocator` -> `AttributeStorageAllocator`
+* `addTypes` -> `addAttributes`
+
+Aside from that, all of the interfaces for uniquing and storage construction are
+all the same.
diff --git a/mlir/docs/DeveloperGuide.md b/mlir/docs/DeveloperGuide.md
new file mode 100644
index 0000000000000000000000000000000000000000..745009959256b0913e929d225f438ae92017fcf5
--- /dev/null
+++ b/mlir/docs/DeveloperGuide.md
@@ -0,0 +1,107 @@
+# Developer Guide
+
+This document attempts to describe a few developer policies used in MLIR (such
+as coding standards used) as well as development approach (such as testing
+methods).
+
+## Style guide
+
+MLIR follows the [LLVM style](https://llvm.org/docs/CodingStandards.html) guide.
+We also adhere to the following (which deviate from or are not specified in the
+LLVM style guide):
+
+* Adopts [camelBack](https://llvm.org/docs/Proposals/VariableNames.html);
+* Except for IR units (Region, Block, and Operation), non-nullable output
+  arguments are passed by non-const reference in general.
+* IR constructs are not designed for [const correctness](UsageOfConst.md). +* Do *not* use recursive algorithms if the recursion can't be bounded + statically: that is avoid recursion if there is a possible IR input that can + trigger a stack overflow (for example traversing use-def chains in a + recursive way). At the moment, we tolerate it for the two following cases: + * The nesting of the IR: we use recursion when traversing nested regions. + * Type nesting: recursion may be used for the nesting of composite types. +* Follow the `git` conventions for writing a commit message, in particular the + first line is the "title", it should be followed by an empty line and an + optional description. This [post](https://chris.beams.io/posts/git-commit/) + give examples and more details. + +Please run clang-format on the files you modified with the `.clang-format` +configuration file available in the root directory. Check the clang-format +[documentation](https://clang.llvm.org/docs/ClangFormat.html) for more details +on integrating it with your development environment. In particular, if clang is +installed system-wide, running `git clang-format origin/master` will update the +files in the working directory with the relevant formatting changes; don't +forget to include those to the commit. + +## Pass name and other command line options + +To avoid collision between options provided by different dialects, the naming +convention is to prepend the dialect name to every dialect-specific passes and +options in general. Options that are specific to a pass should also be prefixed +with the pass name. For example, the affine dialect provides a loop tiling pass +that is registered on the command line as `-affine-tile`, and with a tile size +option that can be set with `-affine-tile-size`. + +We also avoid `cl::opt` to provide pass options in favor of the +[pass options](WritingAPass.md#instance-specific-pass-options) mechanism. 
This +allows for these options to be serialized in a pass pipeline description, as +well as passing different options to multiple instances of a pass in the same +pipeline. + +## Testing guidelines + +See here for the [testing guide](TestingGuide.md). + +## Guidelines on contributing a new dialect (or important components) + +To contribute a dialect (or a major component in MLIR), it is usual to write an +overview "RFC" (it can be just a few informal paragraphs) and send it to the +MLIR mailing list. When accepting a new component to MLIR, the community is also +accepting the burden of maintaining it. The following points should be +considered when evaluating whether a dialect is a good fit for the core MLIR +repository: + +* What is the overall goal of the dialect? What is the first implementation + milestone? +* How does it fit into the MLIR dialect ecosystem? + * Connection: how does it connect to the existing dialects in a + compilation pipeline(s)? + * Consolidation: is there already a dialect with a similar goal or + matching abstractions; if so, can it be improved instead of adding a new + one? + * Reuse: how does it generalize to similar but slightly different + use-cases? +* What is the community of users that it is serving? +* Who are the future contributors/maintainers beyond those who propose the + dialect? + +On a practical aspect, we will expect the code to follow the other sections of +this document, with an emphasis on the documentation alongside the source code. + +It is prefered to upstream your dialects/components in small incremental patches +that can be individually reviewed. That is, after the initial RFC has been +agreed on, we encourage dialects to be built progressively by faster iterations +in-tree; as long as it is clear they evolve towards their milestones and goals. + +We have seen the following broad categories of dialects: + +* Edge dialects that model a representation external to MLIR. 
Examples include + LLVM, SPIR-V dialects, TensorFlow, XLA/HLO, ... Such dialects may be a + better fit for the project that contains the original representation instead + of being added to the MLIR repository. In particular, because MLIR will not + take an external dependency on another project. +* Structured Abstraction dialects that generalize common features of several + other dialects or introduce a programming model. Generalization is sometimes + demonstrated by having several dialects lower to or originate from a new + dialect. While additional abstractions may be useful, they should be traded + off against the additional complexity of the dialect ecosystem. Examples of + abstraction dialects include the GPU and Loop dialects. +* Transformation dialects that serve as input/output for program + transformations. These dialects are commonly introduced to materialize + transformation pre- and post-conditions in the IR, while conditions can be + obtained through analysis or through operation semantics. Examples include + Affine and Linalg dialects. + +While it can be useful to frame the goals of a proposal, this categorization is +not exhaustive or absolute, and the community is open to discussing any new +dialect beyond this taxonomy. diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md new file mode 100644 index 0000000000000000000000000000000000000000..69a30942c0039b041480750c6edd8e14b8a5138d --- /dev/null +++ b/mlir/docs/Diagnostics.md @@ -0,0 +1,402 @@ +# Introduction and Usage Guide to MLIR's Diagnostics Infrastructure + +[TOC] + +This document presents an introduction to using and interfacing with MLIR's +diagnostics infrastructure. + +See [MLIR specification](LangRef.md) for more information about MLIR, the +structure of the IR, operations, etc. + +## Source Locations + +Source location information is extremely important for any compiler, because it +provides a baseline for debuggability and error-reporting. 
MLIR provides several +different location types depending on the situational need. + +### CallSite Location + +``` +callsite-location ::= 'callsite' '(' location 'at' location ')' +``` + +An instance of this location allows for representing a directed stack of +location usages. This connects a location of a `callee` with the location of a +`caller`. + +### FileLineCol Location + +``` +filelinecol-location ::= string-literal ':' integer-literal ':' integer-literal +``` + +An instance of this location represents a tuple of file, line number, and column +number. This is similar to the type of location that you get from most source +languages. + +### Fused Location + +``` +fused-location ::= `fused` fusion-metadata? '[' location (location ',')* ']' +fusion-metadata ::= '<' attribute-value '>' +``` + +An instance of a `fused` location represents a grouping of several other source +locations, with optional metadata that describes the context of the fusion. +There are many places within a compiler in which several constructs may be fused +together, e.g. pattern rewriting, that normally result in partial or even total +loss of location information. With `fused` locations, this is a non-issue. + +### Name Location + +``` +name-location ::= string-literal ('(' location ')')? +``` + +An instance of this location allows for attaching a name to a child location. +This can be useful for representing the locations of variable, or node, +definitions. + +### Opaque Location + +An instance of this location essentially contains a pointer to some data +structure that is external to MLIR and an optional location that can be used if +the first one is not suitable. Since it contains an external structure, only the +optional location is used during serialization. + +### Unknown Location + +``` +unknown-location ::= `unknown` +``` + +Source location information is an extremely integral part of the MLIR +infrastructure.
As such, location information is always present in the IR, and +must explicitly be set to unknown. Thus an instance of the `unknown` location +represents an unspecified source location. + +## Diagnostic Engine + +The `DiagnosticEngine` acts as the main interface for diagnostics in MLIR. It +manages the registration of diagnostic handlers, as well as the core API for +diagnostic emission. Handlers generally take the form of +`LogicalResult(Diagnostic &)`. If the result is `success`, it signals that the +diagnostic has been fully processed and consumed. If `failure`, it signals that +the diagnostic should be propagated to any previously registered handlers. It +can be interfaced with via an `MLIRContext` instance. + +```c++ +DiagnosticEngine engine = ctx->getDiagEngine(); + +/// Handle the reported diagnostic. +// Return success to signal that the diagnostic has either been fully processed, +// or failure if the diagnostic should be propagated to the previous handlers. +DiagnosticEngine::HandlerID id = engine.registerHandler( + [](Diagnostic &diag) -> LogicalResult { + bool should_propagate_diagnostic = ...; + return failure(should_propagate_diagnostic); +}); + + +// We can also elide the return value completely, in which case the engine assumes +// that all diagnostics are consumed (i.e. a success() result). +DiagnosticEngine::HandlerID id = engine.registerHandler([](Diagnostic &diag) { + return; +}); + +// Unregister this handler when we are done. +engine.eraseHandler(id); +``` + +### Constructing a Diagnostic + +As stated above, the `DiagnosticEngine` holds the core API for diagnostic +emission. A new diagnostic can be emitted with the engine via `emit`. This +method returns an [InFlightDiagnostic](#inflight-diagnostic) that can be +modified further. + +```c++ +InFlightDiagnostic emit(Location loc, DiagnosticSeverity severity); +``` + +Using the `DiagnosticEngine`, though, is generally not the preferred way to emit +diagnostics in MLIR.
[`operation`](LangRef.md#operations) provides utility +methods for emitting diagnostics: + +```c++ +// `emit` methods available in the mlir namespace. +InFlightDiagnostic emitError/Remark/Warning(Location); + +// These methods use the location attached to the operation. +InFlightDiagnostic Operation::emitError/Remark/Warning(); + +// This method creates a diagnostic prefixed with "'op-name' op ". +InFlightDiagnostic Operation::emitOpError(); +``` + +## Diagnostic + +A `Diagnostic` in MLIR contains all of the necessary information for reporting a +message to the user. A `Diagnostic` essentially boils down to three main +components: + +* [Source Location](#source-locations) +* Severity Level + - Error, Note, Remark, Warning +* Diagnostic Arguments + - The diagnostic arguments are used when constructing the output message. + +### Appending arguments + +Once a diagnostic has been constructed, the user can start composing it. The +output message of a diagnostic is composed of a set of diagnostic arguments that +have been attached to it. New arguments can be attached to a diagnostic in a few +different ways: + +```c++ +// A few interesting things to use when composing a diagnostic. +Attribute fooAttr; +Type fooType; +SmallVector fooInts; + +// Diagnostics can be composed via the streaming operators. +op->emitError() << "Compose an interesting error: " << fooAttr << ", " << fooType + << ", (" << fooInts << ')'; + +// This could generate something like (FuncAttr:@foo, IntegerType:i32, {0,1,2}): +"Compose an interesting error: @foo, i32, (0, 1, 2)" +``` + +### Attaching notes + +Unlike many other compiler frameworks, notes in MLIR cannot be emitted directly. +They must be explicitly attached to another non-note diagnostic. When +emitting a diagnostic, notes can be directly attached via `attachNote`. When +attaching a note, if the user does not provide an explicit source location the +note will inherit the location of the parent diagnostic.
+ +```c++ +// Emit a note with an explicit source location. +op->emitError("...").attachNote(noteLoc) << "..."; + +// Emit a note that inherits the parent location. +op->emitError("...").attachNote() << "..."; +``` + +## InFlight Diagnostic + +Now that [Diagnostics](#diagnostic) have been explained, we introduce the +`InFlightDiagnostic`, which is an RAII wrapper around a diagnostic that is set to be +reported. This allows for modifying a diagnostic while it is still in flight. If +it is not reported directly by the user it will automatically be reported when +destroyed. + +```c++ +{ + InFlightDiagnostic diag = op->emitError() << "..."; +} // The diagnostic is automatically reported here. +``` + +## Diagnostic Configuration Options + +Several options are provided to help control and enhance the behavior of +diagnostics. These options are listed below: + +### Print Operation On Diagnostic + +Command Line Flag: `-mlir-print-op-on-diagnostic` + +When a diagnostic is emitted on an operation, via `Operation::emitError/...`, +the textual form of that operation is printed and attached as a note to the +diagnostic. This option is useful for understanding the current form of an +operation that may be invalid, especially when debugging verifier failures. An +example output is shown below: + +```shell +test.mlir:3:3: error: 'module_terminator' op expects parent op 'module' + "module_terminator"() : () -> () + ^ +test.mlir:3:3: note: see current operation: "module_terminator"() : () -> () + "module_terminator"() : () -> () + ^ +``` + +### Print StackTrace On Diagnostic + +Command Line Flag: `-mlir-print-stacktrace-on-diagnostic` + +When a diagnostic is emitted, attach the current stack trace as a note to the +diagnostic. This option is useful for understanding which part of the compiler +generated certain diagnostics.
An example output is shown below: + +```shell +test.mlir:3:3: error: 'module_terminator' op expects parent op 'module' + "module_terminator"() : () -> () + ^ +test.mlir:3:3: note: diagnostic emitted with trace: + #0 0x000055dd40543805 llvm::sys::PrintStackTrace(llvm::raw_ostream&) llvm/lib/Support/Unix/Signals.inc:553:11 + #1 0x000055dd3f8ac162 emitDiag(mlir::Location, mlir::DiagnosticSeverity, llvm::Twine const&) /lib/IR/Diagnostics.cpp:292:7 + #2 0x000055dd3f8abe8e mlir::emitError(mlir::Location, llvm::Twine const&) /lib/IR/Diagnostics.cpp:304:10 + #3 0x000055dd3f998e87 mlir::Operation::emitError(llvm::Twine const&) /lib/IR/Operation.cpp:324:29 + #4 0x000055dd3f99d21c mlir::Operation::emitOpError(llvm::Twine const&) /lib/IR/Operation.cpp:652:10 + #5 0x000055dd3f96b01c mlir::OpTrait::HasParent::Impl::verifyTrait(mlir::Operation*) /mlir/IR/OpDefinition.h:897:18 + #6 0x000055dd3f96ab38 mlir::Op::Impl, mlir::OpTrait::IsTerminator>::BaseVerifier::Impl, mlir::OpTrait::IsTerminator >::verifyTrait(mlir::Operation*) /mlir/IR/OpDefinition.h:1052:29 + # ... + "module_terminator"() : () -> () + ^ +``` + +## Common Diagnostic Handlers + +To interface with the diagnostics infrastructure, users will need to register a +diagnostic handler with the [`DiagnosticEngine`](#diagnostic-engine). +Recognizing that many users will want the same handler functionality, MLIR +provides several common diagnostic handlers for immediate use. + +### Scoped Diagnostic Handler + +This diagnostic handler is a simple RAII class that registers and unregisters a +given diagnostic handler. This class can either be used directly, or in +conjunction with a derived diagnostic handler. + +```c++ +// Construct the handler directly. +MLIRContext context; +ScopedDiagnosticHandler scopedHandler(&context, [](Diagnostic &diag) { + ... +}); + +// Use this handler in conjunction with another.
+class MyDerivedHandler : public ScopedDiagnosticHandler { + MyDerivedHandler(MLIRContext *ctx) : ScopedDiagnosticHandler(ctx) { + // Set the handler that should be RAII managed. + setHandler([&](Diagnostic diag) { + ... + }); + } +}; +``` + +### SourceMgr Diagnostic Handler + +This diagnostic handler is a wrapper around an llvm::SourceMgr instance. It +provides support for displaying diagnostic messages inline with a line of a +respective source file. This handler will also automatically load newly seen +source files into the SourceMgr when attempting to display the source line of a +diagnostic. Example usage of this handler can be seen in the `mlir-opt` tool. + +```shell +$ mlir-opt foo.mlir + +/tmp/test.mlir:6:24: error: expected non-function type +func @foo() -> (index, ind) { + ^ +``` + +To use this handler in your tool, add the following: + +```c++ +SourceMgr sourceMgr; +MLIRContext context; +SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); +``` + +### SourceMgr Diagnostic Verifier Handler + +This handler is a wrapper around a llvm::SourceMgr that is used to verify that +certain diagnostics have been emitted to the context. To use this handler, +annotate your source file with expected diagnostics in the form of: + +* `expected-(error|note|remark|warning) {{ message }}` + +A few examples are shown below: + +```mlir +// Expect an error on the same line. +func @bad_branch() { + br ^missing // expected-error {{reference to an undefined block}} +} + +// Expect an error on an adjacent line. +func @foo(%a : f32) { + // expected-error@+1 {{unknown comparison predicate "foo"}} + %result = cmpf "foo", %a, %a : f32 + return +} + +// Expect an error on the next line that does not contain a designator. +// expected-remark@below {{remark on function below}} +// expected-remark@below {{another remark on function below}} +func @bar(%a : f32) + +// Expect an error on the previous line that does not contain a designator. 
+func @baz(%a : f32) +// expected-remark@above {{remark on function above}} +// expected-remark@above {{another remark on function above}} + +``` + +The handler will report an error if any unexpected diagnostics were seen, or if +any expected diagnostics weren't. + +```shell +$ mlir-opt foo.mlir + +/tmp/test.mlir:6:24: error: unexpected error: expected non-function type +func @foo() -> (index, ind) { + ^ + +/tmp/test.mlir:15:4: error: expected remark "expected some remark" was not produced +// expected-remark {{expected some remark}} + ^~~~~~~~~~~~~~~~~~~~~~~~~~ +``` + +Similarly to the [SourceMgr Diagnostic Handler](#sourcemgr-diagnostic-handler), +this handler can be added to any tool via the following: + +```c++ +SourceMgr sourceMgr; +MLIRContext context; +SourceMgrDiagnosticVerifierHandler sourceMgrHandler(sourceMgr, &context); +``` + +### Parallel Diagnostic Handler + +MLIR is designed from the ground up to be multi-threaded. One important thing +to keep in mind when multi-threading is determinism. This means that the +behavior seen when operating on multiple threads is the same as when operating +on a single thread. For diagnostics, this means that the ordering of the +diagnostics is the same regardless of the number of threads being operated on. +The ParallelDiagnosticHandler is introduced to solve this problem. + +After creating a handler of this type, the only remaining step is to ensure that +each thread that will be emitting diagnostics to the handler sets a respective +'orderID'. The orderID corresponds to the order in which diagnostics would be +emitted when executing synchronously. For example, if we were processing a list +of operations [a, b, c] on a single thread, diagnostics emitted while processing +operation 'a' would be emitted before those for 'b' or 'c'. This corresponds 1-1 +with the 'orderID'. The thread that is processing 'a' should set the orderID to +'0'; the thread processing 'b' should set it to '1'; and so on and so forth.
+This provides a way for the handler to deterministically order the diagnostics +that it receives given the thread that it is receiving on. + +A simple example is shown below: + +```c++ +MLIRContext *context = ...; +ParallelDiagnosticHandler handler(context); + +// Process a list of operations in parallel. +std::vector opsToProcess = ...; +llvm::for_each_n(llvm::parallel::par, 0, opsToProcess.size(), + [&](size_t i) { + // Notify the handler that we are processing the i'th operation. + handler.setOrderIDForThread(i); + auto *op = opsToProcess[i]; + ... + + // Notify the handler that we are finished processing diagnostics on this + // thread. + handler.eraseOrderIDForThread(); +}); +``` diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md new file mode 100644 index 0000000000000000000000000000000000000000..e6b652f21913afb1c26923e564afc70ff4f5bd90 --- /dev/null +++ b/mlir/docs/DialectConversion.md @@ -0,0 +1,277 @@ +# Dialect Conversion + +This document describes a framework in MLIR in which to perform operation +conversions between, and within dialects. This framework allows for transforming +illegal operations to those supported by a provided conversion target, via a set +of pattern-based operation rewriting patterns. + +[TOC] + +To utilize the framework, a few things must be provided: + +* A [Conversion Target](#conversion-target) +* A set of [Rewrite Patterns](#rewrite-pattern-specification) +* A [Type Converter](#type-conversion) (Optional) + +## Modes of Conversion + +When applying a conversion to a set of operations, there are several conversion +modes that can be selected from: + +* Partial Conversion + + - A partial conversion will legalize as many operations to the target as + possible, but will allow pre-existing operations that were not + explicitly marked as `illegal` to remain unconverted. This allows for + partially lowering parts of the module in the presence of unknown + operations. 
+ - A partial conversion can be applied via `applyPartialConversion`. + +* Full Conversion + + - A full conversion is only successful if all operations are properly + legalized to the given conversion target. This ensures that only known + operations will exist after the conversion process. + - A full conversion can be applied via `applyFullConversion`. + +* Analysis Conversion + + - An analysis conversion will analyze which operations are legalizable to + the given conversion target if a conversion were to be applied. Note + that no rewrites, or transformations, are actually applied to the input + operations. + - An analysis conversion can be applied via `applyAnalysisConversion`. + +## Conversion Target + +The conversion target is the formal definition of what is considered to be legal +during the conversion process. The final operations generated by the conversion +framework must be marked as legal on the `ConversionTarget` for the rewrite to +be a success. Existing operations need not always be legal, though; see the +different conversion modes for why. Operations and dialects may be marked with +any of the provided legality actions below: + +* Legal + + - This action signals that every instance of a given operation is legal, + i.e. any combination of attributes, operands, types, etc. are valid. + +* Dynamic + + - This action signals that only some instances of a given operation are + legal. This allows for defining fine-tune constraints, e.g. saying that + `addi` is only legal when operating on 32-bit integers. + - If a specific handler is not provided when setting the action, the + target must override the `isDynamicallyLegal` hook provided by + `ConversionTarget`. + +* Illegal + + - This action signals that no instance of a given operation is legal. + Operations marked as `illegal` must always be converted for the + conversion to be successful. This action also allows for selectively + marking specific operations as illegal in an otherwise legal dialect. 
+ +An example conversion target is shown below: + +```c++ +struct MyTarget : public ConversionTarget { + MyTarget(MLIRContext &ctx) : ConversionTarget(ctx) { + //-------------------------------------------------------------------------- + // Marking an operation as Legal: + + /// Mark all operations within the LLVM dialect are legal. + addLegalDialects(); + + /// Mark `std.constant` op is always legal on this target. + addLegalOps(); + + //-------------------------------------------------------------------------- + // Marking an operation as dynamically legal. + + /// Mark all operations within Affine dialect have dynamic legality + /// constraints. + addDynamicallyLegalDialects(); + + /// Mark `std.return` as dynamically legal. + addDynamicallyLegalOp(); + + /// Mark `std.return` as dynamically legal, but provide a specific legality + /// callback. + addDynamicallyLegalOp([](ReturnOp op) { ... }); + + //-------------------------------------------------------------------------- + // Marking an operation as illegal. + + /// All operations within the GPU dialect are illegal. + addIllegalDialect(); + + /// Mark `std.br` and `std.cond_br` as illegal. + addIllegalOp(); + } + + /// Implement the default legalization handler to handle operations marked as + /// dynamically legal that were not provided with an explicit handler. + bool isDynamicallyLegal(Operation *op) override { ... } +}; +``` + +### Recursive Legality + +In some cases, it may be desirable to mark entire regions of operations as +legal. This provides an additional granularity of context to the concept of +"legal". The `ConversionTarget` supports marking operations, that were +previously added as `Legal` or `Dynamic`, as `recursively` legal. Recursive +legality means that if an operation instance is legal, either statically or +dynamically, all of the operations nested within are also considered legal. 
An +operation can be marked via `markOpRecursivelyLegal<>`: + +```c++ +ConversionTarget &target = ...; + +/// The operation must first be marked as `Legal` or `Dynamic`. +target.addLegalOp(...); +target.addDynamicallyLegalOp(...); + +/// Mark the operation as always recursively legal. +target.markOpRecursivelyLegal(); +/// Mark optionally with a callback to allow selective marking. +target.markOpRecursivelyLegal([](Operation *op) { ... }); +/// Mark optionally with a callback to allow selective marking. +target.markOpRecursivelyLegal([](MyOp op) { ... }); +``` + +## Rewrite Pattern Specification + +After the conversion target has been defined, a set of legalization patterns +must be provided to transform illegal operations into legal ones. The patterns +supplied here, that do not [require type changes](#conversion-patterns), are the +same as those described in the +[quickstart rewrites guide](QuickstartRewrites.md#adding-patterns), but have a +few additional [restrictions](#restrictions). The patterns provided do not need +to generate operations that are directly legal on the target. The framework will +automatically build a graph of conversions to convert non-legal operations into +a set of legal ones. + +As an example, say you define a target that supports one operation: `foo.add`. +When providing the following patterns: [`bar.add` -> `baz.add`, `baz.add` -> +`foo.add`], the framework will automatically detect that it can legalize +`baz.add` -> `foo.add` even though a direct conversion does not exist. This +means that you don’t have to define a direct legalization pattern for `bar.add` +-> `foo.add`. + +### Restrictions + +The framework processes operations in topological order, trying to legalize them +individually. As such, patterns used in the conversion framework have a few +additional restrictions: + +1. If a pattern matches, it must erase or replace the op it matched on. + Operations can *not* be updated in place. +2. 
Match criteria should not be based on the IR outside of the op itself. The + preceding ops will already have been processed by the framework (although it + may not update uses), and the subsequent IR will not yet be processed. This + can create confusion if a pattern attempts to match against a sequence of + ops (e.g. rewrite A + B -> C). That sort of rewrite should be performed in a + separate pass. + +## Type Conversion + +It is sometimes necessary as part of a conversion to convert the set of types +being operated on. In these cases, a `TypeConverter` object may be defined that +details how types should be converted. The `TypeConverter` is used by patterns +and by the general conversion infrastructure to convert the signatures of blocks +and regions. + +### Type Converter + +As stated above, the `TypeConverter` contains several hooks for detailing how to +convert types. Several of these hooks are detailed below: + +```c++ +class TypeConverter { + public: + /// This hook allows for converting a type. This function should return + /// failure if no valid conversion exists, success otherwise. If the new set + /// of types is empty, the type is removed and any usages of the existing + /// value are expected to be removed during conversion. + virtual LogicalResult convertType(Type t, SmallVectorImpl &results); + + /// This hook simplifies defining 1-1 type conversions. This function returns + /// the type to convert to on success, and a null type on failure. + virtual Type convertType(Type t); + + /// This hook allows for materializing a conversion from a set of types into + /// one result type by generating a cast operation of some kind. The generated + /// operation should produce one result, of 'resultType', with the provided + /// 'inputs' as operands. This hook must be overridden when a type conversion + /// results in more than one type, or if a type conversion may persist after + /// the conversion has finished.
+ virtual Operation *materializeConversion(PatternRewriter &rewriter, + Type resultType, + ArrayRef inputs, + Location loc); +}; +``` + +### Conversion Patterns + +When type conversion comes into play, the general Rewrite Patterns can no longer +be used. This is due to the fact that the operands of the operation being +matched will not correspond with the operands of the correct type as determined +by `TypeConverter`. The operation rewrites on type boundaries must thus use a +special pattern, the `ConversionPattern`. This pattern provides, as an +additional argument to the `matchAndRewrite` and `rewrite` methods, the set of +remapped operands corresponding to the desired type. These patterns also utilize +a special `PatternRewriter`, `ConversionPatternRewriter`, that provides special +hooks for use with the conversion infrastructure. + +```c++ +struct MyConversionPattern : public ConversionPattern { + /// The `matchAndRewrite` hooks on ConversionPatterns take an additional + /// `operands` parameter, containing the remapped operands of the original + /// operation. + virtual PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const; +}; +``` + +These patterns have the same [restrictions](#restrictions) as the basic rewrite +patterns used in dialect conversion. + +### Region Signature Conversion + +From the perspective of type conversion, the entry block to a region is often +special. The types of the entry block arguments are often tied semantically to +details on the operation, e.g. FuncOp, AffineForOp, etc. Given this, the +conversion of the types for this block must be done explicitly via a conversion +pattern. To convert the signature of a region entry block, a custom hook on the +ConversionPatternRewriter must be invoked `applySignatureConversion`. 
A +signature conversion, `TypeConverter::SignatureConversion`, can be built +programmatically: + +```c++ +class SignatureConversion { +public: + /// Remap an input of the original signature with a new set of types. The + /// new types are appended to the new signature conversion. + void addInputs(unsigned origInputNo, ArrayRef types); + + /// Append new input types to the signature conversion, this should only be + /// used if the new types are not intended to remap an existing input. + void addInputs(ArrayRef types); + + /// Remap an input of the original signature with a range of types in the + /// new signature. + void remapInput(unsigned origInputNo, unsigned newInputNo, + unsigned newInputCount = 1); + + /// Remap an input of the original signature to another `replacement` + /// value. This drops the original argument. + void remapInput(unsigned origInputNo, Value replacement); +}; +``` + +The `TypeConverter` provides several default utilities for signature conversion: +`convertSignatureArg`/`convertBlockSignature`. diff --git a/mlir/docs/Dialects/Affine.md b/mlir/docs/Dialects/Affine.md new file mode 100644 index 0000000000000000000000000000000000000000..c5dcf6a679027082da0618198b21734bbf8aeacf --- /dev/null +++ b/mlir/docs/Dialects/Affine.md @@ -0,0 +1,610 @@ +# Affine Dialect + +This dialect provides a powerful abstraction for affine operations and analyses. + +[TOC] + +## Polyhedral Structures + +MLIR uses techniques from polyhedral compilation to make dependence analysis and +loop transformations efficient and reliable. This section introduces some of the +core concepts that are used throughout the document. + +### Dimensions and Symbols + +Dimensions and symbols are the two kinds of identifiers that can appear in the +polyhedral structures, and are always of [`index`](../LangRef.md#index-type) +type. Dimensions are declared in parentheses and symbols are declared in square +brackets. + +Examples: + +```mlir +// A 2d to 3d affine mapping. 
+// d0/d1 are dimensions, s0 is a symbol +#affine_map2to3 = (d0, d1)[s0] -> (d0, d1 + s0, d1 - s0) +``` + +Dimensional identifiers correspond to the dimensions of the underlying structure +being represented (a map, set, or more concretely a loop nest or a tensor); for +example, a three-dimensional loop nest has three dimensional identifiers. Symbol +identifiers represent an unknown quantity that can be treated as constant for a +region of interest. + +Dimensions and symbols are bound to SSA values by various operations in MLIR and +use the same parenthesized vs square bracket list to distinguish the two. + +Syntax: + +``` +// Uses of SSA values that are passed to dimensional identifiers. +dim-use-list ::= `(` ssa-use-list? `)` + +// Uses of SSA values that are used to bind symbols. +symbol-use-list ::= `[` ssa-use-list? `]` + +// Most things that bind SSA values bind dimensions and symbols. +dim-and-symbol-use-list ::= dim-use-list symbol-use-list? +``` + +SSA values bound to dimensions and symbols must always have 'index' type. + +Example: + +```mlir +#affine_map2to3 = (d0, d1)[s0] -> (d0, d1 + s0, d1 - s0) +// Binds %N to the s0 symbol in affine_map2to3. +%x = alloc()[%N] : memref<40x50xf32, #affine_map2to3> +``` + +### Restrictions on Dimensions and Symbols + +The affine dialect imposes certain restrictions on dimension and symbolic +identifiers to enable powerful analysis and transformation. 
A symbolic +identifier can be bound to an SSA value that is either an argument to the +function, a value defined at the top level of that function (outside of all +loops and if operations), the result of a +[`constant` operation](Standard.md#constant-operation), or the result of an +[`affine.apply` operation](#affineapply-operation) that recursively takes as +arguments any symbolic identifiers, or the result of a [`dim` +operation](Standard.md#dim-operation) on either a memref that is a function +argument or a memref where the corresponding dimension is either static or a +dynamic one in turn bound to a symbolic identifier. Dimensions may be bound not +only to anything that a symbol is bound to, but also to induction variables of +enclosing [`affine.for` operations](#affinefor-operation), and the result of an +[`affine.apply` operation](#affineapply-operation) (which recursively may use +other dimensions and symbols). + +### Affine Expressions + +Syntax: + +``` +affine-expr ::= `(` affine-expr `)` + | affine-expr `+` affine-expr + | affine-expr `-` affine-expr + | `-`? integer-literal `*` affine-expr + | affine-expr `ceildiv` integer-literal + | affine-expr `floordiv` integer-literal + | affine-expr `mod` integer-literal + | `-`affine-expr + | bare-id + | `-`? integer-literal + +multi-dim-affine-expr ::= `(` affine-expr (`,` affine-expr)* `)` +``` + +`ceildiv` is the ceiling function which maps the result of the division of its +first argument by its second argument to the smallest integer greater than or +equal to that result. `floordiv` is a function which maps the result of the +division of its first argument by its second argument to the largest integer +less than or equal to that result. `mod` is the modulo operation: since its +second argument is always positive, its results are always positive in our +usage. The `integer-literal` operand for ceildiv, floordiv, and mod is always +expected to be positive. 
`bare-id` is an identifier which must have type +[index](../LangRef.md#index-type). The precedence of operations in an affine +expression are ordered from highest to lowest in the order: (1) +parenthesization, (2) negation, (3) modulo, multiplication, floordiv, and +ceildiv, and (4) addition and subtraction. All of these operators associate from +left to right. + +A _multidimensional affine expression_ is a comma separated list of +one-dimensional affine expressions, with the entire list enclosed in +parentheses. + +**Context:** An affine function, informally, is a linear function plus a +constant. More formally, a function f defined on a vector $$\vec{v} \in +\mathbb{Z}^n$$ is a multidimensional affine function of $$\vec{v}$$ if +$$f(\vec{v})$$ can be expressed in the form $$M \vec{v} + \vec{c}$$ where $$M$$ +is a constant matrix from $$\mathbb{Z}^{m \times n}$$ and $$\vec{c}$$ is a +constant vector from $$\mathbb{Z}$$. $$m$$ is the dimensionality of such an +affine function. MLIR further extends the definition of an affine function to +allow 'floordiv', 'ceildiv', and 'mod' with respect to positive integer +constants. Such extensions to affine functions have often been referred to as +quasi-affine functions by the polyhedral compiler community. MLIR uses the term +'affine map' to refer to these multidimensional quasi-affine functions. As +examples, $$(i+j+1, j)$$, $$(i \mod 2, j+i)$$, $$(j, i/4, i \mod 4)$$, $$(2i+1, +j)$$ are two-dimensional affine functions of $$(i, j)$$, but $$(i \cdot j, +i^2)$$, $$(i \mod j, i/j)$$ are not affine functions of $$(i, j)$$. + +### Affine Maps + +Syntax: + +``` +affine-map-inline + ::= dim-and-symbol-id-lists `->` multi-dim-affine-expr +``` + +The identifiers in the dimensions and symbols lists must be unique. These are +the only identifiers that may appear in 'multi-dim-affine-expr'. 
Affine maps +with one or more symbols in its specification are known as "symbolic affine +maps", and those with no symbols as "non-symbolic affine maps". + +**Context:** Affine maps are mathematical functions that transform a list of +dimension indices and symbols into a list of results, with affine expressions +combining the indices and symbols. Affine maps distinguish between +[indices and symbols](#dimensions-and-symbols) because indices are inputs to the +affine map when the map is called (through an operation such as +[affine.apply](#affineapply-operation)), whereas symbols are bound when +the map is established (e.g. when a memref is formed, establishing a +memory [layout map](../LangRef.md#layout-map)). + +Affine maps are used for various core structures in MLIR. The restrictions we +impose on their form allows powerful analysis and transformation, while keeping +the representation closed with respect to several operations of interest. + +#### Named affine mappings + +Syntax: + +``` +affine-map-id ::= `#` suffix-id + +// Definitions of affine maps are at the top of the file. +affine-map-def ::= affine-map-id `=` affine-map-inline +module-header-def ::= affine-map-def + +// Uses of affine maps may use the inline form or the named form. +affine-map ::= affine-map-id | affine-map-inline +``` + +Affine mappings may be defined inline at the point of use, or may be hoisted to +the top of the file and given a name with an affine map definition, and used by +name. + +Examples: + +```mlir +// Affine map out-of-line definition and usage example. +#affine_map42 = (d0, d1)[s0] -> (d0, d0 + d1 + s0 floordiv 2) + +// Use an affine mapping definition in an alloc operation, binding the +// SSA value %N to the symbol s0. +%a = alloc()[%N] : memref<4x4xf32, #affine_map42> + +// Same thing with an inline affine mapping definition. 
+%b = alloc()[%N] : memref<4x4xf32, (d0, d1)[s0] -> (d0, d0 + d1 + s0 floordiv 2)> +``` + +### Semi-affine maps + +Semi-affine maps are extensions of affine maps to allow multiplication, +`floordiv`, `ceildiv`, and `mod` with respect to symbolic identifiers. +Semi-affine maps are thus a strict superset of affine maps. + +Syntax of semi-affine expressions: + +``` +semi-affine-expr ::= `(` semi-affine-expr `)` + | semi-affine-expr `+` semi-affine-expr + | semi-affine-expr `-` semi-affine-expr + | symbol-or-const `*` semi-affine-expr + | semi-affine-expr `ceildiv` symbol-or-const + | semi-affine-expr `floordiv` symbol-or-const + | semi-affine-expr `mod` symbol-or-const + | bare-id + | `-`? integer-literal + +symbol-or-const ::= `-`? integer-literal | symbol-id + +multi-dim-semi-affine-expr ::= `(` semi-affine-expr (`,` semi-affine-expr)* `)` +``` + +The precedence and associativity of operations in the syntax above is the same +as that for [affine expressions](#affine-expressions). + +Syntax of semi-affine maps: + +``` +semi-affine-map-inline + ::= dim-and-symbol-id-lists `->` multi-dim-semi-affine-expr +``` + +Semi-affine maps may be defined inline at the point of use, or may be hoisted to +the top of the file and given a name with a semi-affine map definition, and used +by name. + +``` +semi-affine-map-id ::= `#` suffix-id + +// Definitions of semi-affine maps are at the top of file. +semi-affine-map-def ::= semi-affine-map-id `=` semi-affine-map-inline +module-header-def ::= semi-affine-map-def + +// Uses of semi-affine maps may use the inline form or the named form. +semi-affine-map ::= semi-affine-map-id | semi-affine-map-inline +``` + +### Integer Sets + +An integer set is a conjunction of affine constraints on a list of identifiers. +The identifiers associated with the integer set are separated out into two +classes: the set's dimension identifiers, and the set's symbolic identifiers. +The set is viewed as being parametric on its symbolic identifiers. 
In the +syntax, the list of set's dimension identifiers are enclosed in parentheses +while its symbols are enclosed in square brackets. + +Syntax of affine constraints: + +``` +affine-constraint ::= affine-expr `>=` `0` + | affine-expr `==` `0` +affine-constraint-conjunction ::= affine-constraint (`,` affine-constraint)* +``` + +Integer sets may be defined inline at the point of use, or may be hoisted to the +top of the file and given a name with an integer set definition, and used by +name. + +``` +integer-set-id ::= `#` suffix-id + +integer-set-inline + ::= dim-and-symbol-id-lists `:` '(' affine-constraint-conjunction? ')' + +// Declarations of integer sets are at the top of the file. +integer-set-decl ::= integer-set-id `=` integer-set-inline + +// Uses of integer sets may use the inline form or the named form. +integer-set ::= integer-set-id | integer-set-inline +``` + +The dimensionality of an integer set is the number of identifiers appearing in +dimension list of the set. The affine-constraint non-terminals appearing in the +syntax above are only allowed to contain identifiers from dims and symbols. A +set with no constraints is a set that is unbounded along all of the set's +dimensions. + +Example: + +```mlir +// A example two-dimensional integer set with two symbols. +#set42 = (d0, d1)[s0, s1] + : (d0 >= 0, -d0 + s0 - 1 >= 0, d1 >= 0, -d1 + s1 - 1 >= 0) + +// Inside a Region +affine.if #set42(%i, %j)[%M, %N] { + ... +} +``` + +`d0` and `d1` correspond to dimensional identifiers of the set, while `s0` and +`s1` are symbol identifiers. + +## Operations + +#### 'affine.apply' operation + +Syntax: + +``` +operation ::= ssa-id `=` `affine.apply` affine-map dim-and-symbol-use-list +``` + +The `affine.apply` operation applies an +[affine mapping](#affine-expressions) to a list of SSA values, +yielding a single SSA value. 
The number of dimension and symbol arguments to +affine.apply must be equal to the respective number of dimensional and symbolic +inputs to the affine mapping; the `affine.apply` operation always returns one +value. The input operands and result must all have 'index' type. + +Example: + +```mlir +#map10 = (d0, d1) -> (d0 floordiv 8 + d1 floordiv 128) +... +%1 = affine.apply #map10 (%s, %t) + +// Inline example. +%2 = affine.apply (i)[s0] -> (i+s0) (%42)[%n] +``` + +#### 'affine.for' operation + +Syntax: + +``` +operation ::= `affine.for` ssa-id `=` lower-bound `to` upper-bound + (`step` integer-literal)? `{` op* `}` + +lower-bound ::= `max`? affine-map dim-and-symbol-use-list | shorthand-bound +upper-bound ::= `min`? affine-map dim-and-symbol-use-list | shorthand-bound +shorthand-bound ::= ssa-id | `-`? integer-literal +``` + +The `affine.for` operation represents an affine loop nest. It has one region +containing its body. This region must contain one block that terminates with +[`affine.terminator`](#affineterminator-operation). *Note:* when `affine.for` is +printed in custom format, the terminator is omitted. The block has one argument +of [`index`](../LangRef.md#index-type) type that represents the induction +variable of the loop. + +The `affine.for` operation executes its body a number of times iterating from a +lower bound to an upper bound by a stride. The stride, represented by `step`, is +a positive constant integer which defaults to "1" if not present. The lower and +upper bounds specify a half-open range: the range includes the lower bound but +does not include the upper bound. + +The lower and upper bounds of a `affine.for` operation are represented as an +application of an affine mapping to a list of SSA values passed to the map. The +[same restrictions](#restrictions-on-dimensions-and-symbols) hold for these SSA +values as for all bindings of SSA values to dimensions and symbols. 
+ +The affine mappings for the bounds may return multiple results, in which case +the `max`/`min` keywords are required (for the lower/upper bound respectively), +and the bound is the maximum/minimum of the returned values. There is no +semantic ambiguity, but MLIR syntax requires the use of these keywords to make +things more obvious to human readers. + +Many upper and lower bounds are simple, so MLIR accepts two custom form +syntaxes: the form that accepts a single 'ssa-id' (e.g. `%N`) is shorthand for +applying that SSA value to a function that maps a single symbol to itself, e.g., +`()[s]->(s)()[%N]`. The integer literal form (e.g. `-42`) is shorthand for a +nullary mapping function that returns the constant value (e.g. `()->(-42)()`). + +Example showing reverse iteration of the inner loop: + +```mlir +#map57 = (d0)[s0] -> (s0 - d0 - 1) + +func @simple_example(%A: memref, %B: memref) { + %N = dim %A, 0 : memref + affine.for %i = 0 to %N step 1 { + affine.for %j = 0 to %N { // implicitly steps by 1 + %0 = affine.apply #map57(%j)[%N] + %tmp = call @F1(%A, %i, %0) : (memref, index, index)->(f32) + call @F2(%tmp, %B, %i, %0) : (f32, memref, index, index)->() + } + } + return +} +``` + +#### 'affine.if' operation + +Syntax: + +``` +operation ::= `affine.if` if-op-cond `{` op* `}` (`else` `{` op* `}`)? +if-op-cond ::= integer-set dim-and-symbol-use-list +``` + +The `affine.if` operation restricts execution to a subset of the loop iteration +space defined by an integer set (a conjunction of affine constraints). A single +`affine.if` may end with an optional `else` clause. + +The condition of the `affine.if` is represented by an +[integer set](#integer-sets) (a conjunction of affine constraints), +and the SSA values bound to the dimensions and symbols in the integer set. The +[same restrictions](#restrictions-on-dimensions-and-symbols) hold for these SSA +values as for all bindings of SSA values to dimensions and symbols. 
+ +The `affine.if` operation contains two regions for the "then" and "else" +clauses. The latter may be empty (i.e. contain no blocks), meaning the absence +of the else clause. When non-empty, both regions must contain exactly one block +terminating with [`affine.terminator`](#affineterminator-operation). *Note:* +when `affine.if` is printed in custom format, the terminator is omitted. These +blocks must not have any arguments. + +Example: + +```mlir +#set = (d0, d1)[s0]: (d0 - 10 >= 0, s0 - d0 - 9 >= 0, + d1 - 10 >= 0, s0 - d1 - 9 >= 0) +func @reduced_domain_example(%A, %X, %N) : (memref<10xi32>, i32, i32) { + affine.for %i = 0 to %N { + affine.for %j = 0 to %N { + %0 = affine.apply #map42(%j) + %tmp = call @S1(%X, %i, %0) + affine.if #set(%i, %j)[%N] { + %1 = affine.apply #map43(%i, %j) + call @S2(%tmp, %A, %i, %1) + } + } + } + return +} +``` + +#### 'affine.load' operation + +Syntax: + +``` +operation ::= ssa-id `=` `affine.load` ssa-use `[` multi-dim-affine-map-of-ssa-ids `]` `:` memref-type +``` + +The `affine.load` op reads an element from a memref, where the index for each +memref dimension is an affine expression of loop induction variables and +symbols. The output of 'affine.load' is a new value with the same type as the +elements of the memref. An affine expression of loop IVs and symbols must be +specified for each dimension of the memref. The keyword 'symbol' can be used to +indicate SSA identifiers which are symbolic. + +Example: + +```mlir + + Example 1: + + %1 = affine.load %0[%i0 + 3, %i1 + 7] : memref<100x100xf32> + + Example 2: Uses 'symbol' keyword for symbols '%n' and '%m'. 
+
+  %1 = affine.load %0[%i0 + symbol(%n), %i1 + symbol(%m)]
+    : memref<100x100xf32>
+
+```
+
+#### 'affine.store' operation
+
+Syntax:
+
+```
+operation ::= `affine.store` ssa-use, ssa-use `[` multi-dim-affine-map-of-ssa-ids `]` `:` memref-type
+```
+
+The `affine.store` op writes an element to a memref, where the index for each
+memref dimension is an affine expression of loop induction variables and
+symbols. The 'affine.store' op stores a new value which is the same type as the
+elements of the memref. An affine expression of loop IVs and symbols must be
+specified for each dimension of the memref. The keyword 'symbol' can be used to
+indicate SSA identifiers which are symbolic.
+
+Example:
+
+```mlir
+
+  Example 1:
+
+    affine.store %v0, %0[%i0 + 3, %i1 + 7] : memref<100x100xf32>
+
+  Example 2: Uses 'symbol' keyword for symbols '%n' and '%m'.
+
+    affine.store %v0, %0[%i0 + symbol(%n), %i1 + symbol(%m)]
+      : memref<100x100xf32>
+
+```
+
+#### 'affine.dma_start' operation
+
+Syntax:
+
+```
+operation ::= `affine.dma_start` ssa-use `[` multi-dim-affine-map-of-ssa-ids `]`, `[` multi-dim-affine-map-of-ssa-ids `]`, `[` multi-dim-affine-map-of-ssa-ids `]`, ssa-use `:` memref-type
+```
+
+The `affine.dma_start` op starts a non-blocking DMA operation that transfers
+data from a source memref to a destination memref. The source and destination
+memref need not be of the same dimensionality, but need to have the same
+elemental type. The operands include the source and destination memrefs,
+each followed by its indices, the size of the data transfer in terms of the
+number of elements (of the elemental type of the memref), a tag memref with
+its indices, and optionally at the end, a stride and a
+number_of_elements_per_stride arguments. The tag location is used by an
+AffineDmaWaitOp to check for completion. The indices of the source memref,
+destination memref, and the tag memref have the same restrictions as any
+affine.load/store. 
In particular, index for each memref dimension must be an
+affine expression of loop induction variables and symbols.
+The optional stride arguments should be of 'index' type, and specify a
+stride for the slower memory space (memory space with a lower memory space
+id), transferring chunks of number_of_elements_per_stride every stride until
+%num_elements are transferred. Either both or no stride arguments should be
+specified. The value of 'num_elements' must be a multiple of
+'number_of_elements_per_stride'.
+
+
+Example:
+
+```mlir
+
+For example, a DmaStartOp operation that transfers 256 elements of a memref
+'%src' in memory space 0 at indices [%i + 3, %j] to memref '%dst' in memory
+space 1 at indices [%k + 7, %l], would be specified as follows:
+
+  %num_elements = constant 256
+  %idx = constant 0 : index
+  %tag = alloc() : memref<1xi32, 4>
+  affine.dma_start %src[%i + 3, %j], %dst[%k + 7, %l], %tag[%idx],
+    %num_elements :
+      memref<40x128xf32, 0>, memref<2x1024xf32, 1>, memref<1xi32, 2>
+
+  If %stride and %num_elt_per_stride are specified, the DMA is expected to
+  transfer %num_elt_per_stride elements every %stride elements apart from
+  memory space 0 until %num_elements are transferred.
+
+  affine.dma_start %src[%i, %j], %dst[%k, %l], %tag[%idx], %num_elements,
+    %stride, %num_elt_per_stride : ...
+
+```
+
+#### 'affine.dma_wait' operation
+
+Syntax:
+
+```
+operation ::= `affine.dma_wait` ssa-use `[` multi-dim-affine-map-of-ssa-ids `]`, ssa-use `:` memref-type
+```
+
+The `affine.dma_wait` op blocks until the completion of a DMA operation
+associated with the tag element '%tag[%index]'. %tag is a memref, and %index
+has to be an index with the same restrictions as any load/store index.
+In particular, index for each memref dimension must be an affine expression of
+loop induction variables and symbols. 
%num_elements is the number of elements +associated with the DMA operation. For example: + +Example: + +```mlir + + affine.dma_start %src[%i, %j], %dst[%k, %l], %tag[%index], %num_elements : + memref<2048xf32, 0>, memref<256xf32, 1>, memref<1xi32, 2> + ... + ... + affine.dma_wait %tag[%index], %num_elements : memref<1xi32, 2> + +``` + +#### 'affine.min' operation + +Syntax: + +``` +operation ::= ssa-id `=` `affine.min` affine-map dim-and-symbol-use-list +``` + +The `affine.min` operation applies an +[affine mapping](#affine-expressions) to a list of SSA values, and returns the +minimum value of all result expressions. The number of dimension and symbol +arguments to affine.min must be equal to the respective number of dimensional +and symbolic inputs to the affine mapping; the `affine.min` operation always +returns one value. The input operands and result must all have 'index' type. + +Example: + +```mlir + +%0 = affine.min (d0)[s0] -> (1000, d0 + 512, s0) (%arg0)[%arg1] + +``` + +#### `affine.terminator` operation + +Syntax: + +``` +operation ::= `"affine.terminator"() : () -> ()` +``` + +Affine terminator is a special terminator operation for blocks inside affine +loops ([`affine.for`](#affinefor-operation)) and branches +([`affine.if`](#affineif-operation)). It unconditionally transmits the control +flow to the successor of the operation enclosing the region. + +*Rationale*: bodies of affine operations are [blocks](../LangRef.md#blocks) that +must have terminators. Loops and branches represent structured control flow and +should not accept arbitrary branches as terminators. + +This operation does _not_ have a custom syntax. However, affine control +operations omit the terminator in their custom syntax for brevity. 
diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md new file mode 100644 index 0000000000000000000000000000000000000000..7dcd8f6053c420add42a84147d9bbffb35699b91 --- /dev/null +++ b/mlir/docs/Dialects/GPU.md @@ -0,0 +1,132 @@ +# GPU Dialect + +Note: this dialect is more likely to change than others in the near future; use +with caution. + +This dialect provides middle-level abstractions for launching GPU kernels +following a programming model similar to that of CUDA or OpenCL. It provides +abstractions for kernel invocations (and may eventually provide those for device +management) that are not present at the lower level (e.g., as LLVM IR intrinsics +for GPUs). Its goal is to abstract away device- and driver-specific +manipulations to launch a GPU kernel and provide a simple path towards GPU +execution from MLIR. It may be targeted, for example, by DSLs using MLIR. The +dialect uses `gpu` as its canonical prefix. + +## Memory attribution + +Memory buffers are defined at the function level, either in "gpu.launch" or in +"gpu.func" ops. This encoding makes it clear where the memory belongs and makes +the lifetime of the memory visible. The memory is only accessible while the +kernel is launched/the function is currently invoked. The latter is more strict +than actual GPU implementations but using static memory at the function level is +just for convenience. It is also always possible to pass pointers to the +workgroup memory into other functions, provided they expect the correct memory +space. + +The buffers are considered live throughout the execution of the GPU function +body. The absence of memory attribution syntax means that the function does not +require special buffers. 
Rationale: although the underlying models declare +memory buffers at the module level, we chose to do it at the function level to +provide some structuring for the lifetime of those buffers; this avoids the +incentive to use the buffers for communicating between different kernels or +launches of the same kernel, which should be done through function arguments +instead; we chose not to use `alloca`-style approach that would require more +complex lifetime analysis following the principles of MLIR that promote +structure and representing analysis results in the IR. + +## Operations + +### `gpu.block_dim` + +Returns the number of threads in the thread block (aka the block size) along the +x, y, or z `dimension`. + +Example: + +```mlir + %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) +``` + +### `gpu.block_id` + +Returns the block id, i.e. the index of the current block within the grid along +the x, y, or z `dimension`. + +Example: + +```mlir + %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) +``` + +### `gpu.grid_dim` + +Returns the number of thread blocks in the grid along the x, y, or z +`dimension`. + +Example: + +```mlir + %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) +``` + +### `gpu.thread_id` + +Returns the thread id, i.e. the index of the current thread within the block +along the x, y, or z `dimension`. + +Example: + +```mlir + %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) +``` + +### `gpu.yield` + +Is a special terminator operation for blocks inside regions in gpu ops. It +returns values to the immediately enclosing gpu op. + +Example: + +```mlir +gpu.yield %f0, %f1 : f32, f32 +``` + +### `gpu.all_reduce` + +The "all_reduce" op reduces the value of every work item across a local +workgroup. The result is equal for all work items of a workgroup. 
+ +For example, both + +```mlir +%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32) +%2 = "gpu.all_reduce"(%0) ({ +^bb(%lhs : f32, %rhs : f32): + %sum = addf %lhs, %rhs : f32 + "gpu.yield"(%sum) : (f32) -> () +}) : (f32) -> (f32) +``` + +compute the sum of each work item's %0 value. The first version specifies the +accumulation as operation, whereas the second version specifies the accumulation +as code region. The accumulation operation must either be `add` or `mul`. + +Either none or all work items of a workgroup need to execute this op +in convergence. + +### `gpu.barrier` + +The "barrier" op synchronizes all work items of a workgroup. It is used +to coordinate communication between the work items of the workgroup. + +```mlir +gpu.barrier +``` + +waits until all work items in the workgroup have reached this point and all +memory accesses made by these work items prior to the op are visible to all work +items in the workgroup. Data hazards between work items accessing the same +memory can be avoided by synchronizing work items in-between these accesses. + +Either none or all work items of a workgroup need to execute this op +in convergence. diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md new file mode 100644 index 0000000000000000000000000000000000000000..00d0fa02fece6c4de8fd4a17c38b366ae280e7d3 --- /dev/null +++ b/mlir/docs/Dialects/LLVM.md @@ -0,0 +1,429 @@ +# LLVM IR Dialect + +This dialect wraps the LLVM IR types and instructions into MLIR types and +operations. It provides several additional operations that are necessary to +cover for the differences in the IR structure (e.g., MLIR does not have `phi` +operations and LLVM IR does not have a `constant` operation). + +In this document, we use "LLVM IR" to designate the +[intermediate representation of LLVM](https://llvm.org/docs/LangRef.html) and +"LLVM IR _dialect_" to refer to the MLIR dialect reflecting LLVM instructions +and types. 
+
+[TOC]
+
+## Context and Module Association
+
+The LLVM IR dialect object _contains_ an LLVM Context and an LLVM Module that it
+uses to define, print, parse and manage LLVM IR types. These objects can be
+obtained from the dialect object using `.getLLVMContext()` and
+`.getLLVMModule()`. All LLVM IR objects that interact with the LLVM IR dialect
+must exist in the dialect's context.
+
+## Types
+
+The LLVM IR dialect defines a single MLIR type, `LLVM::LLVMType`, that can wrap
+any existing LLVM IR type. Its syntax is as follows:
+
+```
+type ::= `!llvm<"` llvm-canonical-type `">`
+llvm-canonical-type ::= <canonical textual representation defined by LLVM>
+```
+
+For example, one can use primitive types `!llvm.i32`, pointer types
+`!llvm<"i8*">`, vector types `!llvm<"<4 x float>">` or structure types
+`!llvm<"{i32, float}">`. The parsing and printing of the canonical form is
+delegated to the LLVM assembly parser and printer.
+
+LLVM IR dialect types contain an `llvm::Type*` object that can be obtained by
+calling `.getUnderlyingType()` and used in LLVM API calls directly. These
+objects are allocated within the LLVM context associated with the LLVM IR
+dialect and may be linked to the properties of the associated LLVM module.
+
+LLVM IR dialect type can be constructed from any `llvm::Type*` that is
+associated with the LLVM context of the dialect. In this document, we use the
+term "wrapped LLVM IR type" to refer to the LLVM IR dialect type containing a
+specific LLVM IR type.
+
+## Operations
+
+All operations in the LLVM IR dialect have a custom form in MLIR. The mnemonic
+of an operation is that used in LLVM IR prefixed with "`llvm.`".
+
+### LLVM functions
+
+MLIR functions are defined by an operation that is not built into the IR itself.
+The LLVM IR dialect provides an `llvm.func` operation to define functions
+compatible with LLVM IR. These functions have wrapped LLVM IR function type but
+use MLIR syntax to express it. They are required to have exactly one result
+type. 
LLVM function operation is intended to capture additional properties of
+LLVM functions, such as linkage and calling convention, that may be modeled
+differently by the built-in MLIR function.
+
+```mlir
+// The type of @bar is !llvm<"i64 (i64)">
+llvm.func @bar(%arg0: !llvm.i64) -> !llvm.i64 {
+  llvm.return %arg0 : !llvm.i64
+}
+
+// The type of @foo is !llvm<"void (i64)">
+// !llvm.void type is omitted
+llvm.func @foo(%arg0: !llvm.i64) {
+  llvm.return
+}
+
+// A function with `internal` linkage.
+llvm.func internal @internal_func() {
+  llvm.return
+}
+
+```
+
+### LLVM IR operations
+
+The following operations are currently supported. The semantics of these
+operations corresponds to the semantics of the similarly-named LLVM IR
+instructions.
+
+#### Integer binary arithmetic operations
+
+Take two arguments of wrapped LLVM IR integer type, produce one value of the
+same type.
+
+- `add`
+- `sub`
+- `mul`
+- `udiv`
+- `sdiv`
+- `urem`
+- `srem`
+
+Examples:
+
+```mlir
+// Integer addition.
+%0 = llvm.add %a, %b : !llvm.i32
+
+// Unsigned integer division.
+%1 = llvm.udiv %a, %b : !llvm.i32
+```
+
+#### Floating point binary arithmetic operations
+
+Take two arguments of wrapped LLVM IR floating point type, produce one value of
+the same type.
+
+- `fadd`
+- `fsub`
+- `fmul`
+- `fdiv`
+- `frem`
+
+Examples:
+
+```mlir
+// Float addition.
+%0 = llvm.fadd %a, %b : !llvm.float
+
+// Float division.
+%1 = llvm.fdiv %a, %b : !llvm.float
+```
+
+#### Memory-related operations
+
+- `<r> = alloca <size> x <type>`
+- `<r> = getelementptr <address>[<index> (, <index>)+]`
+- `<r> = load <address>`
+- `store <value>, <address>`
+
+In these operations, `<size>` must be a value of wrapped LLVM IR integer type,
+`<address>` must be a value of wrapped LLVM IR pointer type, and `<value>` must
+be a value of wrapped LLVM IR type that corresponds to the pointer type of
+`<address>
`. + +The `index` operands are integer values whose semantics is identical to the +non-pointer arguments of LLVM IR's `getelementptr`. + +Examples: + +```mlir +// Allocate an array of 4 floats on stack +%c4 = llvm.mlir.constant(4) : !llvm.i64 +%0 = llvm.alloca %c4 x !llvm.float : (!llvm.i64) -> !llvm<"float*"> + +// Get the second element of the array (note 0-based indexing). +%c1 = llvm.mlir.constant(1) : !llvm.i64 +%1 = llvm.getelementptr %0[%c1] : (!llvm<"float*">, !llvm.i64) + -> !llvm<"float*"> + +// Store a constant into this element. +%cf = llvm.mlir.constant(42.0 : f32) : !llvm.float +llvm.store %cf, %1 : !llvm<"float*"> + +// Load the value from this element. +%3 = llvm.load %1 : !llvm<"float*"> +``` + +#### Operations on values of aggregate type. + +- ` = extractvalue [ (, )+]` +- ` = insertvalue , [ (, )+]` + +In these operations, `` must be a value of wrapped LLVM IR structure +type and `` must be a value that corresponds to one of the (nested) +structure element types. + +Note the use of integer literals to designate subscripts, which is made possible +by `extractvalue` and `insertvalue` must have constant subscripts. Internally, +they are modeled as array attributes. + +Examples: + +```mlir +// Get the value third element of the second element of a structure. +%0 = llvm.extractvalue %s[1, 2] : !llvm<"{i32, {i1, i8, i16}"> + +// Insert the value to the third element of the second element of a structure. +// Note that this returns a new structure-typed value. +%1 = llvm.insertvalue %0, %s[1, 2] : !llvm<"{i32, {i1, i8, i16}"> +``` + +#### Terminator operations. + +Branch operations: + +- `br [()]` +- `cond_br [(),` + `()]` + +In order to comply with MLIR design, branch operations in the LLVM IR dialect +pass arguments to basic blocks. Successors must be valid block MLIR identifiers +and operand lists for each of them must have the same types as the arguments of +the respective blocks. `` must be a wrapped LLVM IR `i1` type. 
+ +Since LLVM IR uses the name of the predecessor basic block to identify the +sources of a PHI node, it is invalid for two entries of the PHI node to indicate +different values coming from the same block. Therefore, `cond_br` in the LLVM IR +dialect disallows its successors to be the same block _if_ this block has +arguments. + +Examples: + +```mlir +// Branch without arguments. +^bb0: + llvm.br ^bb0 + +// Branch and pass arguments. +^bb1(%arg: !llvm.i32): + llvm.br ^bb1(%arg : !llvm.i32) + +// Conditionally branch and pass arguments to one of the blocks. +llvm.cond_br %cond, ^bb0, %bb1(%arg : !llvm.i32) + +// It's okay to use the same block without arguments, but probably useless. +llvm.cond_br %cond, ^bb0, ^bb0 + +// ERROR: Passing different arguments to the same block in a conditional branch. +llvm.cond_br %cond, ^bb1(%0 : !llvm.i32), ^bb1(%1 : !llvm.i32) + +``` + +Call operations: + +- ` = call()` +- `call()` + +In LLVM IR, functions may return either 0 or 1 value. LLVM IR dialect implements +this behavior by providing a variadic `call` operation for 0- and 1-result +functions. Even though MLIR supports multi-result functions, LLVM IR dialect +disallows them. + +The `call` instruction supports both direct and indirect calls. Direct calls +start with a function name (`@`-prefixed) and indirect calls start with an SSA +value (`%`-prefixed). The direct callee, if present, is stored as a function +attribute `callee`. The trailing type of the instruction is always the MLIR +function type, which may be different from the indirect callee that has the +wrapped LLVM IR function type. + +Examples: + +```mlir +// Direct call without arguments and with one result. +%0 = llvm.call @foo() : () -> (!llvm.float) + +// Direct call with arguments and without a result. +llvm.call @bar(%0) : (!llvm.float) -> () + +// Indirect call with an argument and without a result. +llvm.call %1(%0) : (!llvm.float) -> () +``` + +#### Miscellaneous operations. 
+ +Integer comparisons: `icmp "predicate" , `. The following predicate +values are supported: + +- `eq` - equality comparison; +- `ne` - inequality comparison; +- `slt` - signed less-than comparison +- `sle` - signed less-than-or-equal comparison +- `sgt` - signed greater-than comparison +- `sge` - signed greater-than-or-equal comparison +- `ult` - unsigned less-than comparison +- `ule` - unsigned less-than-or-equal comparison +- `ugt` - unsigned greater-than comparison +- `uge` - unsigned greater-than-or-equal comparison + +Bitwise reinterpretation: `bitcast `. + +Selection: `select , , `. + +### Auxiliary MLIR operations + +These operations do not have LLVM IR counterparts but are necessary to map LLVM +IR into MLIR. They should be prefixed with `llvm.mlir`. + +#### `llvm.mlir.addressof` + +Creates an SSA value containing a pointer to a global variable or constant +defined by `llvm.mlir.global`. The global value can be defined after its first +referenced. If the global value is a constant, storing into it is not allowed. + +Examples: + +```mlir +func @foo() { + // Get the address of a global. + %0 = llvm.mlir.addressof @const : !llvm<"i32*"> + + // Use it as a regular pointer. + %1 = llvm.load %0 : !llvm<"i32*"> +} + +// Define the global. +llvm.mlir.global @const(42 : i32) : !llvm.i32 +``` + +#### `llvm.mlir.constant` + +Unlike LLVM IR, MLIR does not have first-class constant values. Therefore, all +constants must be created as SSA values before being used in other operations. +`llvm.mlir.constant` creates such values for scalars and vectors. It has a +mandatory `value` attribute, which may be an integer, floating point attribute; +dense or sparse attribute containing integers or floats. The type of the +attribute is one the corresponding MLIR standard types. It may be omitted for +`i64` and `f64` types that are implied. The operation produces a new SSA value +of the specified LLVM IR dialect type. 
The type of that value _must_ correspond +to the attribute type converted to LLVM IR. + +Examples: + +```mlir +// Integer constant, internal i32 is mandatory +%0 = llvm.mlir.constant(42 : i32) : !llvm.i32 + +// It's okay to omit i64. +%1 = llvm.mlir.constant(42) : !llvm.i64 + +// Floating point constant. +%2 = llvm.mlir.constant(42.0 : f32) : !llvm.float + +// Splat dense vector constant. +%3 = llvm.mlir.constant(dense<1.0> : vector<4xf32>) : !llvm<"<4 x float>"> +``` + +#### `llvm.mlir.global` + +Since MLIR allows for arbitrary operations to be present at the top level, +global variables are defined using the `llvm.mlir.global` operation. Both global +constants and variables can be defined, and the value may also be initialized in +both cases. + +There are two forms of initialization syntax. Simple constants that can be +represented as MLIR attributes can be given in-line: + +```mlir +llvm.mlir.global @variable(32.0 : f32) : !llvm.float +``` + +This initialization and type syntax is similar to `llvm.mlir.constant` and may +use two types: one for MLIR attribute and another for the LLVM value. These +types must be compatible. + +More complex constants that cannot be represented as MLIR attributes can be +given in an initializer region: + +```mlir +// This global is initialized with the equivalent of: +// i32* getelementptr (i32* @g2, i32 2) +llvm.mlir.global constant @int_gep() : !llvm<"i32*"> { + %0 = llvm.mlir.addressof @g2 : !llvm<"i32*"> + %1 = llvm.mlir.constant(2 : i32) : !llvm.i32 + %2 = llvm.getelementptr %0[%1] : (!llvm<"i32*">, !llvm.i32) -> !llvm<"i32*"> + // The initializer region must end with `llvm.return`. + llvm.return %2 : !llvm<"i32*"> +} +``` + +Only one of the initializer attribute or initializer region may be provided. + +`llvm.mlir.global` must appear at top-level of the enclosing module. It uses an +@-identifier for its value, which will be uniqued by the module with respect to +other @-identifiers in it. 
+ +Examples: + +```mlir +// Global values use @-identifiers. +llvm.mlir.global constant @cst(42 : i32) : !llvm.i32 + +// Non-constant values must also be initialized. +llvm.mlir.global @variable(32.0 : f32) : !llvm.float + +// Strings are expected to be of wrapped LLVM i8 array type and do not +// automatically include the trailing zero. +llvm.mlir.global @string("abc") : !llvm<"[3 x i8]"> + +// For strings globals, the trailing type may be omitted. +llvm.mlir.global constant @no_trailing_type("foo bar") + +// A complex initializer is constructed with an initializer region. +llvm.mlir.global constant @int_gep() : !llvm<"i32*"> { + %0 = llvm.mlir.addressof @g2 : !llvm<"i32*"> + %1 = llvm.mlir.constant(2 : i32) : !llvm.i32 + %2 = llvm.getelementptr %0[%1] : (!llvm<"i32*">, !llvm.i32) -> !llvm<"i32*"> + llvm.return %2 : !llvm<"i32*"> +} +``` + +#### `llvm.mlir.null` + +Unlike LLVM IR, MLIR does not have first-class null pointers. They must be +explicitly created as SSA values using `llvm.mlir.null`. This operation has +operands or attributes, and returns a null value of a wrapped LLVM IR pointer +type. + +Examples: + +```mlir +// Null pointer to i8 value. +%0 = llvm.mlir.null : !llvm<"i8*"> + +// Null pointer to a function with signature void() value. +%1 = llvm.mlir.null : !llvm<"void()*"> +``` + +#### `llvm.mlir.undef` + +Unlike LLVM IR, MLIR does not have first-class undefined values. Such values +must be created as SSA values using `llvm.mlir.undef`. This operation has no +operands or attributes. It creates an undefined value of the specified LLVM IR +dialect type wrapping an LLVM IR structure type. + +Example: + +```mlir +// Create a structure with a 32-bit integer followed by a float. 
+%0 = llvm.mlir.undef : !llvm<"{i32, float}"> +``` diff --git a/mlir/docs/Dialects/Linalg.md b/mlir/docs/Dialects/Linalg.md new file mode 100644 index 0000000000000000000000000000000000000000..1ed5a2c2a2641ef96072b8eadb1351a880d71354 --- /dev/null +++ b/mlir/docs/Dialects/Linalg.md @@ -0,0 +1,8 @@ +# Linalg Dialect + +To generate the documentation: + +```sh +mlir-tblgen --gen-op-doc -I /path/to/mlir/include \ +/path/to/mlir/include/mlir/Dialect/Linalg/IR/LinalgDoc.td +``` diff --git a/mlir/docs/Dialects/SPIR-V.md b/mlir/docs/Dialects/SPIR-V.md new file mode 100644 index 0000000000000000000000000000000000000000..1d72e5449d3e846fe9b6d691912d84eee313b229 --- /dev/null +++ b/mlir/docs/Dialects/SPIR-V.md @@ -0,0 +1,1039 @@ +# SPIR-V Dialect + +This document describes the design of the SPIR-V dialect in MLIR. It lists +various design choices we made for modeling different SPIR-V mechanisms, and +their rationale. + +This document also explains in a high-level manner how different components are +organized and implemented in the code and gives steps to follow for extending +them. + +This document assumes familiarity with SPIR-V. [SPIR-V][Spirv] is the Khronos +Group’s binary intermediate language for representing graphics shaders and +compute kernels. It is adopted by multiple Khronos Group’s APIs, including +Vulkan and OpenCL. It is fully defined in a +[human-readable specification][SpirvSpec]; the syntax of various SPIR-V +instructions are encoded in a [machine-readable grammar][SpirvGrammar]. + +## Design Guidelines + +SPIR-V is a binary intermediate language that serves dual purpose: on one side, +it is an intermediate language to represent graphics shaders and compute kernels +for high-level languages to target; on the other side, it defines a stable +binary format for hardware driver consumption. As a result, SPIR-V has design +principles pertain to not only intermediate language, but also binary format. +For example, regularity is one of the design goals of SPIR-V. 
All concepts are +represented as SPIR-V instructions, including declaring extensions and +capabilities, defining types and constants, defining functions, attaching +additional properties to computation results, etc. This way favors binary +encoding and decoding for driver consumption but not necessarily compiler +transformations. + +### Dialect design principles + +The main objective of the SPIR-V dialect is to be a proper intermediate +representation (IR) to facilitate compiler transformations. While we still aim +to support serializing to and deserializing from the binary format for various +good reasons, the binary format and its concerns play less a role in the design +of the SPIR-V dialect: when there is a trade-off to be made between favoring IR +and supporting binary format, we lean towards the former. + +On the IR aspect, the SPIR-V dialect aims to model SPIR-V at the same semantic +level. It is not intended to be a higher level or lower level abstraction than +the SPIR-V specification. Those abstractions are easily outside the domain of +SPIR-V and should be modeled with other proper dialects so they can be shared +among various compilation paths. Because of the dual purpose of SPIR-V, SPIR-V +dialect staying at the same semantic level as the SPIR-V specification also +means we can still have straightforward serailization and deserailization for +the majority of functionalities. + +To summarize, the SPIR-V dialect follows the following design principles: + +* Stay as the same semantic level as the SPIR-V specification by having + one-to-one mapping for most concepts and entities. +* Adopt SPIR-V specification's syntax if possible, but deviate intentionally + to utilize MLIR mechanisms if it results in better representation and + benefits transformation. +* Be straightforward to serialize into and deserialize from the SPIR-V binary + format. 
+ +SPIR-V is designed to be consumed by hardware drivers, so its representation is +quite clear, yet verbose for some cases. Allowing representational deviation +gives us the flexibility to reduce the verbosity by using MLIR mechanisms. + +### Dialect scopes + +SPIR-V supports multiple execution environments, specified by client APIs. +Notable adopters include Vulkan and OpenCL. It follows that the SPIR-V dialect +should support multiple execution environments if to be a proper proxy of SPIR-V +in MLIR systems. The SPIR-V dialect is designed with these considerations: it +has proper support for versions, extensions, and capabilities and is as +extensible as SPIR-V specification. + +## Conventions + +The SPIR-V dialect adopts the following conventions for IR: + +* The prefix for all SPIR-V types and operations are `spv.`. +* All instructions in an extended instruction set are further qualified with + the extended instruction set's prefix. For example, all operations in the + GLSL extended instruction set is has the prefix of `spv.GLSL.`. +* Ops that directly mirror instructions in the specification have `CamelCase` + names that are the same as the instruction opnames (without the `Op` + prefix). For example, `spv.FMul` is a direct mirror of `OpFMul` in the + specification. Such an op will be serialized into and deserialized from one + SPIR-V instruction. +* Ops with `snake_case` names are those that have different representation + from corresponding instructions (or concepts) in the specification. These + ops are mostly for defining the SPIR-V structure. For example, `spv.module` + and `spv.constant`. They may correspond to one or more instructions during + (de)serialization. +* Ops with `_snake_case` names are those that have no corresponding + instructions (or concepts) in the binary format. They are introduced to + satisfy MLIR structural requirements. For example, `spv._module_end` and + `spv._merge`. They maps to no instructions during (de)serialization. 
+ +(TODO: consider merging the last two cases and adopting `spv.mlir.` prefix for +them.) + +## Module + +A SPIR-V module is defined via the `spv.module` op, which has one region that +contains one block. Model-level instructions, including function definitions, +are all placed inside the block. Functions are defined using the builtin `func` +op. + +We choose to model a SPIR-V module with a dedicated `spv.module` op based on the +following considerations: + +* It maps cleanly to a SPIR-V module in the specification. +* We can enforce SPIR-V specific verification that is suitable to be performed + at the module-level. +* We can attach additional model-level attributes. +* We can control custom assembly form. + +The `spv.module` op's region cannot capture SSA values from outside, neither +implicitly nor explicitly. The `spv.module` op's region is closed as to what ops +can appear inside: apart from the builtin `func` op, it can only contain ops +from the SPIR-V dialect. The `spv.module` op's verifier enforces this rule. This +meaningfully guarantees that a `spv.module` can be the entry point and boundary +for serialization. + +### Module-level operations + +SPIR-V binary format defines the following [sections][SpirvLogicalLayout]: + +1. Capabilities required by the module. +1. Extensions required by the module. +1. Extended instructions sets required by the module. +1. Addressing and memory model specification. +1. Entry point specifications. +1. Execution mode declarations. +1. Debug instructions. +1. Annotation/decoration instructions. +1. Type, constant, global variables. +1. Function declarations. +1. Function definitions. + +Basically, a SPIR-V binary module contains multiple module-level instructions +followed by a list of functions. Those module-level instructions are essential +and they can generate result ids referenced by functions, notably, declaring +resource variables to interact with the execution environment. 
+ +Compared to the binary format, we adjust how these module-level SPIR-V +instructions are represented in the SPIR-V dialect: + +#### Use MLIR attributes for metadata + +* Requirements for capabilities, extensions, extended instruction sets, + addressing model, and memory model is conveyed using `spv.module` + attributes. This is considered better because these information are for the + execution environment. It's easier to probe them if on the module op itself. +* Annotations/decoration instructions are "folded" into the instructions they + decorate and represented as attributes on those ops. This eliminates + potential forward references of SSA values, improves IR readability, and + makes querying the annotations more direct. More discussions can be found in + the [`Decorations`](#decorations) section. + +#### Model types with MLIR custom types + +* Types are represented using MLIR standard types and SPIR-V dialect specific + types. There are no type declaration ops in the SPIR-V dialect. More + discussions can be found in the [Types](#types) section later. + +#### Unify and localize constants + +* Various normal constant instructions are represented by the same + `spv.constant` op. Those instructions are just for constants of different + types; using one op to represent them reduces IR verbosity and makes + transformations less tedious. +* Normal constants are not placed in `spv.module`'s region; they are localized + into functions. This is to make functions in the SPIR-V dialect to be + isolated and explicit capturing. Constants are cheap to duplicate given + attributes are uniqued in `MLIRContext`. + +#### Adopt symbol-based global variables and specialization constant + +* Global variables are defined with the `spv.globalVariable` op. They do not + generate SSA values. Instead they have symbols and should be referenced via + symbols. To use a global variables in a function block, `spv._address_of` is + needed to turn the symbol into a SSA value. 
+* Specialization constants are defined with the `spv.specConstant` op. Similar + to global variables, they do not generate SSA values and have symbols for + reference, too. `spv._reference_of` is needed to turn the symbol into a SSA + value for use in a function block. + +The above choices enables functions in the SPIR-V dialect to be isolated and +explicit capturing. + +#### Disallow implicit capturing in functions + +* In SPIR-V specification, functions support implicit capturing: they can + reference SSA values defined in modules. In the SPIR-V dialect functions are + defined with `func` op, which disallows implicit capturing. This is more + friendly to compiler analyses and transformations. More discussions can be + found in the [Function](#function) section later. + +### Model entry points and execution models as normal ops + +* A SPIR-V module can have multiple entry points. And these entry points refer + to the function and interface variables. It’s not suitable to model them as + `spv.module` op attributes. We can model them as normal ops of using symbol + references. +* Similarly for execution modes, which are coupled with entry points, we can + model them as normal ops in `spv.module`'s region. + +## Decorations + +Annotations/decorations provide additional information on result ids. In SPIR-V, +all instructions can generate result ids, including value-computing and +type-defining ones. + +For decorations on value result ids, we can just have a corresponding attribute +attached to the operation generating the SSA value. For example, for the +following SPIR-V: + +```spirv +OpDecorate %v1 RelaxedPrecision +OpDecorate %v2 NoContraction +... +%v1 = OpFMul %float %0 %0 +%v2 = OpFMul %float %1 %1 +``` + +We can represent them in the SPIR-V dialect as: + +```mlir +%v1 = "spv.FMul"(%0, %0) {RelaxedPrecision: unit} : (f32, f32) -> (f32) +%v2 = "spv.FMul"(%1, %1) {NoContraction: unit} : (f32, f32) -> (f32) +``` + +This approach benefits transformations. 
Essentially those decorations are just +additional properties of the result ids (and thus their defining instructions). +In SPIR-V binary format, they are just represented as instructions. Literally +following SPIR-V binary format means we need to through def-use chains to find +the decoration instructions and query information from them. + +For decorations on type result ids, notice that practically, only result ids +generated from composite types (e.g., `OpTypeArray`, `OpTypeStruct`) need to be +decorated for memory layouting purpose (e.g., `ArrayStride`, `Offset`, etc.); +scalar/vector types are required to be uniqued in SPIR-V. Therefore, we can just +encode them directly in the dialect-specific type. + +## Types + +Theoretically we can define all SPIR-V types using MLIR extensible type system, +but other than representational purity, it does not buy us more. Instead, we +need to maintain the code and invest in pretty printing them. So we prefer to +use builtin/standard types if possible. + +The SPIR-V dialect reuses standard integer, float, and vector types: + +Specification | Dialect +:----------------------------------: | :-------------------------------: +`OpTypeBool` | `i1` +`OpTypeInt ` | `i` +`OpTypeFloat ` | `f` +`OpTypeVector ` | `vector< x >` + +Similarly, `mlir::NoneType` can be used for SPIR-V `OpTypeVoid`; builtin +function types can be used for SPIR-V `OpTypeFunction` types. + +The SPIR-V dialect and defines the following dialect-specific types: + +``` +spirv-type ::= array-type + | image-type + | pointer-type + | runtime-array-type + | struct-type +``` + +### Array type + +This corresponds to SPIR-V [array type][ArrayType]. 
Its syntax is + +``` +element-type ::= integer-type + | floating-point-type + | vector-type + | spirv-type + +array-type ::= `!spv.array<` integer-literal `x` element-type `>` +``` + +For example, + +```mlir +!spv.array<4 x i32> +!spv.array<16 x vector<4 x f32>> +``` + +### Image type + +This corresponds to SPIR-V [image type][ImageType]. Its syntax is + +``` +dim ::= `1D` | `2D` | `3D` | `Cube` | + +depth-info ::= `NoDepth` | `IsDepth` | `DepthUnknown` + +arrayed-info ::= `NonArrayed` | `Arrayed` + +sampling-info ::= `SingleSampled` | `MultiSampled` + +sampler-use-info ::= `SamplerUnknown` | `NeedSampler` | `NoSampler` + +format ::= `Unknown` | `Rgba32f` | + +image-type ::= `!spv.image<` element-type `,` dim `,` depth-info `,` + arrayed-info `,` sampling-info `,` + sampler-use-info `,` format `>` +``` + +For example, + +```mlir +!spv.image +!spv.image +``` + +### Pointer type + +This corresponds to SPIR-V [pointer type][PointerType]. Its syntax is + +``` +storage-class ::= `UniformConstant` + | `Uniform` + | `Workgroup` + | + +pointer-type ::= `!spv.ptr<` element-type `,` storage-class `>` +``` + +For example, + +```mlir +!spv.ptr +!spv.ptr, Uniform> +``` + +### Runtime array type + +This corresponds to SPIR-V [runtime array type][RuntimeArrayType]. Its syntax is + +``` +runtime-array-type ::= `!spv.rtarray<` element-type `>` +``` + +For example, + +```mlir +!spv.rtarray +!spv.rtarray> +``` + +### Struct type + +This corresponds to SPIR-V [struct type][StructType]. Its syntax is + +``` +struct-member-decoration ::= integer-literal? spirv-decoration* +struct-type ::= `!spv.struct<` spirv-type (`[` struct-member-decoration `]`)? + (`, ` spirv-type (`[` struct-member-decoration `]`)? +``` + +For Example, + +```mlir +!spv.struct +!spv.struct +!spv.struct> +!spv.struct +``` + +## Function + +In SPIR-V, a function construct consists of multiple instructions involving +`OpFunction`, `OpFunctionParameter`, `OpLabel`, `OpFunctionEnd`. 
+ +```spirv +// int f(int v) { return v; } +%1 = OpTypeInt 32 0 +%2 = OpTypeFunction %1 %1 +%3 = OpFunction %1 %2 +%4 = OpFunctionParameter %1 +%5 = OpLabel +%6 = OpReturnValue %4 + OpFunctionEnd +``` + +This construct is very clear yet quite verbose. It is intended for driver +consumption. There is little benefit to literally replicate this construct in +the SPIR-V dialect. Instead, we reuse the builtin `func` op to express functions +more concisely: + +```mlir +func @f(%arg: i32) -> i32 { + "spv.ReturnValue"(%arg) : (i32) -> (i32) +} +``` + +A SPIR-V function can have at most one result. It cannot contain nested +functions or non-SPIR-V operations. `spv.module` verifies these requirements. + +A major difference between the SPIR-V dialect and the SPIR-V specification for +functions is that the former are isolated and require explicit capturing, while +the latter allow implicit capturing. In SPIR-V specification, functions can +refer to SSA values (generated by constants, global variables, etc.) defined in +modules. The SPIR-V dialect adjusted how constants and global variables are +modeled to enable isolated functions. Isolated functions are more friendly to +compiler analyses and transformations. This also enables the SPIR-V dialect to +better utilize core infrastructure: many functionalities in the core +infrastructure requires ops to be isolated, e.g., the +[greedy pattern rewriter][GreedyPatternRewriter] can only act on ops isolated +from above. + +(TODO: create a dedicated `spv.fn` op for SPIR-V functions.) + +## Operations + +In SPIR-V, instruction is a generalized concept; a SPIR-V module is just a +sequence of instructions. Declaring types, expressing computations, annotating +result ids, expressing control flows and others are all in the form of +instructions. + +We only discuss instructions expressing computations here, which can be +represented via SPIR-V dialect ops. 
Module-level instructions for declarations +and definitions are represented differently in the SPIR-V dialect as explained +earlier in the [Module-level operations](#module-level-operations) section. + +An instruction computes zero or one result from zero or more operands. The +result is a new result id. An operand can be a result id generated by a previous +instruction, an immediate value, or a case of an enum type. We can model result +id operands and results with MLIR SSA values; for immediate value and enum +cases, we can model them with MLIR attributes. + +For example, + +```spirv +%i32 = OpTypeInt 32 0 +%c42 = OpConstant %i32 42 +... +%3 = OpVariable %i32 Function 42 +%4 = OpIAdd %i32 %c42 %c42 +``` + +can be represented in the dialect as + +```mlir +%0 = "spv.constant"() { value = 42 : i32 } : () -> i32 +%1 = "spv.Variable"(%0) { storage_class = "Function" } : (i32) -> !spv.ptr +%2 = "spv.IAdd"(%0, %0) : (i32, i32) -> i32 +``` + +Operation documentation is written in each op's Op Definition Spec using +TableGen. A markdown version of the doc can be generated using `mlir-tblgen +-gen-doc`. + +### Ops from extended instruction sets + +Analogically extended instruction set is a mechanism to import SPIR-V +instructions within another namespace. [`GLSL.std.450`][GlslStd450] is an +extended instruction set that provides common mathematical routines that should +be supported. Instead of modeling `OpExtInstImport` as a separate op and use a +single op to model `OpExtInst` for all extended instructions, we model each +SPIR-V instruction in an extended instruction set as a separate op with the +proper name prefix. For example, for + +```spirv +%glsl = OpExtInstImport "GLSL.std.450" + +%f32 = OpTypeFloat 32 +%cst = OpConstant %f32 ... 
+ +%1 = OpExtInst %f32 %glsl 28 %cst +%2 = OpExtInst %f32 %glsl 31 %cst +``` + +we can have + +```mlir +%1 = "spv.GLSL.Log"(%cst) : (f32) -> (f32) +%2 = "spv.GLSL.Sqrt(%cst) : (f32) -> (f32) +``` + +## Control Flow + +SPIR-V binary format uses merge instructions (`OpSelectionMerge` and +`OpLoopMerge`) to declare structured control flow. They explicitly declare a +header block before the control flow diverges and a merge block where control +flow subsequently converges. These blocks delimit constructs that must nest, and +can only be entered and exited in structured ways. + +In the SPIR-V dialect, we use regions to mark the boundary of a structured +control flow construct. With this approach, it's easier to discover all blocks +belonging to a structured control flow construct. It is also more idiomatic to +MLIR system. + +We introduce a `spv.selection` and `spv.loop` op for structured selections and +loops, respectively. The merge targets are the next ops following them. Inside +their regions, a special terminator, `spv._merge` is introduced for branching to +the merge target. + +### Selection + +`spv.selection` defines a selection construct. It contains one region. The +region should contain at least two blocks: one selection header block and one +merge block. + +* The selection header block should be the first block. It should contain the + `spv.BranchConditional` or `spv.Switch` op. +* The merge block should be the last block. The merge block should only + contain a `spv._merge` op. Any block can branch to the merge block for early + exit. + +``` + +--------------+ + | header block | (may have multiple outgoing branches) + +--------------+ + / | \ + ... + + + +---------+ +---------+ +---------+ + | case #0 | | case #1 | | case #2 | ... (may have branches between each other) + +---------+ +---------+ +---------+ + + + ... 
+ \ | / + v + +-------------+ + | merge block | (may have multiple incoming branches) + +-------------+ +``` + +For example, for the given function + +```c++ +void loop(bool cond) { + int x = 0; + if (cond) { + x = 1; + } else { + x = 2; + } + // ... +} +``` + +It will be represented as + +```mlir +func @selection(%cond: i1) -> () { + %zero = spv.constant 0: i32 + %one = spv.constant 1: i32 + %two = spv.constant 2: i32 + %x = spv.Variable init(%zero) : !spv.ptr + + spv.selection { + spv.BranchConditional %cond, ^then, ^else + + ^then: + spv.Store "Function" %x, %one : i32 + spv.Branch ^merge + + ^else: + spv.Store "Function" %x, %two : i32 + spv.Branch ^merge + + ^merge: + spv._merge + } + + // ... +} + +``` + +### Loop + +`spv.loop` defines a loop construct. It contains one region. The region should +contain at least four blocks: one entry block, one loop header block, one loop +continue block, one merge block. + +* The entry block should be the first block and it should jump to the loop + header block, which is the second block. +* The merge block should be the last block. The merge block should only + contain a `spv._merge` op. Any block except the entry block can branch to + the merge block for early exit. +* The continue block should be the second to last block and it should have a + branch to the loop header block. +* The loop continue block should be the only block, except the entry block, + branching to the loop header block. + +``` + +-------------+ + | entry block | (one outgoing branch) + +-------------+ + | + v + +-------------+ (two incoming branches) + | loop header | <-----+ (may have one or two outgoing branches) + +-------------+ | + | + ... | + \ | / | + v | + +---------------+ | (may have multiple incoming branches) + | loop continue | -----+ (may have one or two outgoing branches) + +---------------+ + + ... 
+ \ | / + v + +-------------+ (may have multiple incoming branches) + | merge block | + +-------------+ +``` + +The reason to have another entry block instead of directly using the loop header +block as the entry block is to satisfy region's requirement: entry block of +region may not have predecessors. We have a merge block so that branch ops can +reference it as successors. The loop continue block here corresponds to +"continue construct" using SPIR-V spec's term; it does not mean the "continue +block" as defined in the SPIR-V spec, which is "a block containing a branch to +an OpLoopMerge instruction’s Continue Target." + +For example, for the given function + +```c++ +void loop(int count) { + for (int i = 0; i < count; ++i) { + // ... + } +} +``` + +It will be represented as + +```mlir +func @loop(%count : i32) -> () { + %zero = spv.constant 0: i32 + %one = spv.constant 1: i32 + %var = spv.Variable init(%zero) : !spv.ptr + + spv.loop { + spv.Branch ^header + + ^header: + %val0 = spv.Load "Function" %var : i32 + %cmp = spv.SLessThan %val0, %count : i32 + spv.BranchConditional %cmp, ^body, ^merge + + ^body: + // ... + spv.Branch ^continue + + ^continue: + %val1 = spv.Load "Function" %var : i32 + %add = spv.IAdd %val1, %one : i32 + spv.Store "Function" %var, %add : i32 + spv.Branch ^header + + ^merge: + spv._merge + } + return +} +``` + +### Block argument for Phi + +There are no direct Phi operations in the SPIR-V dialect; SPIR-V `OpPhi` +instructions are modelled as block arguments in the SPIR-V dialect. (See the +[Rationale][Rationale] doc for "Block Arguments vs Phi nodes".) Each block +argument corresponds to one `OpPhi` instruction in the SPIR-V binary format. For +example, for the following SPIR-V function `foo`: + +```spirv + %foo = OpFunction %void None ... 
+%entry = OpLabel + %var = OpVariable %_ptr_Function_int Function + OpSelectionMerge %merge None + OpBranchConditional %true %true %false + %true = OpLabel + OpBranch %phi +%false = OpLabel + OpBranch %phi + %phi = OpLabel + %val = OpPhi %int %int_1 %false %int_0 %true + OpStore %var %val + OpReturn +%merge = OpLabel + OpReturn + OpFunctionEnd +``` + +It will be represented as: + +```mlir +func @foo() -> () { + %var = spv.Variable : !spv.ptr + + spv.selection { + %true = spv.constant true + spv.BranchConditional %true, ^true, ^false + + ^true: + %zero = spv.constant 0 : i32 + spv.Branch ^phi(%zero: i32) + + ^false: + %one = spv.constant 1 : i32 + spv.Branch ^phi(%one: i32) + + ^phi(%arg: i32): + spv.Store "Function" %var, %arg : i32 + spv.Return + + ^merge: + spv._merge + } + spv.Return +} +``` + +## Shader interface (ABI) + +SPIR-V itself is just expressing computation happening on GPU device. SPIR-V +programs themselves are not enough for running workloads on GPU; a companion +host application is needed to manage the resources referenced by SPIR-V programs +and dispatch the workload. For the Vulkan execution environment, the host +application will be written using Vulkan API. Unlike CUDA, the SPIR-V program +and the Vulkan application are typically authored with different front-end +languages, which isolates these two worlds. Yet they still need to match +_interfaces_: the variables declared in a SPIR-V program for referencing +resources need to match with the actual resources managed by the application +regarding their parameters. + +Still using Vulkan as an example execution environment, there are two primary +resource types in Vulkan: buffers and images. They are used to back various uses +that may differ regarding the classes of operations (load, store, atomic) to be +performed. These uses are differentiated via descriptor types. 
(For example, +uniform storage buffer descriptors can only support load operations while +storage buffer descriptors can support load, store, and atomic operations.) +Vulkan uses a binding model for resources. Resources are associated with +descriptors and descriptors are further grouped into sets. Each descriptor thus +has a set number and a binding number. Descriptors in the application +corresponds to variables in the SPIR-V program. Their parameters must match, +including but not limited to set and binding numbers. + +Apart from buffers and images, there is other data that is set up by Vulkan and +referenced inside the SPIR-V program, for example, push constants. They also +have parameters that require matching between the two worlds. + +The interface requirements are external information to the SPIR-V compilation +path in MLIR. Besides, each Vulkan application may want to handle resources +differently. To avoid duplication and to share common utilities, a SPIR-V shader +interface specification needs to be defined to provide the external requirements +to and guide the SPIR-V compilation path. + +### Shader interface attributes + +The SPIR-V dialect defines [a few attributes][MlirSpirvAbi] for specifying these +interfaces: + +* `spv.entry_point_abi` is a struct attribute that should be attached to the + entry function. It contains: + * `local_size` for specifying the local work group size for the dispatch. +* `spv.interface_var_abi` is a struct attribute that should be attached to + each operand and result of the entry function. It contains: + * `descriptor_set` for specifying the descriptor set number for the + corresponding resource variable. + * `binding` for specifying the binding number for the corresponding + resource variable. + * `storage_class` for specifying the storage class for the corresponding + resource variable. 
+
+The SPIR-V dialect provides a [`LowerABIAttributesPass`][MlirSpirvPasses] for
+consuming these attributes and creating a SPIR-V module complying with the
+interface.
+
+## Serialization and deserialization
+
+Although the main objective of the SPIR-V dialect is to act as a proper IR for
+compiler transformations, being able to serialize to and deserialize from the
+binary format is still very valuable for many good reasons. Serialization
+enables the artifacts of SPIR-V compilation to be consumed by an execution
+environment; deserialization allows us to import SPIR-V binary modules and run
+transformations on them. So serialization and deserialization are supported from
+the very beginning of the development of the SPIR-V dialect.
+
+The serialization library provides two entry points, `mlir::spirv::serialize()`
+and `mlir::spirv::deserialize()`, for converting an MLIR SPIR-V module to binary
+format and back. The [Code organization](#code-organization) explains more about
+this.
+
+Given that the focus is transformations, which inevitably means changes to the
+binary module, serialization is not designed to be a general tool for
+investigating the SPIR-V binary module and does not guarantee roundtrip
+equivalence (at least for now). For the latter, please use the
+assembler/disassembler in the [SPIRV-Tools][SpirvTools] project.
+
+A few transformations are performed in the process of serialization because of
+the representational differences between SPIR-V dialect and binary format:
+
+* Attributes on `spv.module` are emitted as their corresponding SPIR-V
+  instructions.
+* Types are serialized into `OpType*` instructions in the SPIR-V binary module
+  section for types, constants, and global variables.
+* `spv.constant`s are unified and placed in the SPIR-V binary module section
+  for types, constants, and global variables.
+* Attributes on ops, if not part of the op's binary encoding, are emitted as + `OpDecorate*` instructions in the SPIR-V binary module section for + decorations. +* `spv.selection`s and `spv.loop`s are emitted as basic blocks with `Op*Merge` + instructions in the header block as required by the binary format. +* Block arguments are materialized as `OpPhi` instructions at the beginning of + the corresponding blocks. + +Similarly, a few transformations are performed during deserialization: + +* Instructions for execution environment requirements (extensions, + capabilities, extended instruction sets, etc.) will be placed as attributes + on `spv.module`. +* `OpType*` instructions will be converted into proper `mlir::Type`s. +* `OpConstant*` instructions are materialized as `spv.constant` at each use + site. +* `OpVariable` instructions will be converted to `spv.globalVariable` ops if + in module-level; otherwise they will be converted into `spv.Variable` ops. +* Every use of a module-level `OpVariable` instruction will materialize a + `spv._address_of` op to turn the symbol of the corresponding + `spv.globalVariable` into an SSA value. +* Every use of a `OpSpecConstant` instruction will materialize a + `spv._reference_of` op to turn the symbol of the corresponding + `spv.specConstant` into an SSA value. +* `OpPhi` instructions are converted to block arguments. +* Structured control flow are placed inside `spv.selection` and `spv.loop`. + +## Conversions + +(TODO: expand this section) + +## Code organization + +We aim to provide multiple libraries with clear dependencies for SPIR-V related +functionalities in MLIR so developers can just choose the needed components +without pulling in the whole world. + +### The dialect + +The code for the SPIR-V dialect resides in a few places: + +* Public headers are placed in [include/mlir/Dialect/SPIRV][MlirSpirvHeaders]. +* Libraries are placed in [lib/Dialect/SPIRV][MlirSpirvLibs]. 
+* IR tests are placed in [test/Dialect/SPIRV][MlirSpirvTests].
+* Unit tests are placed in [unittests/Dialect/SPIRV][MlirSpirvUnittests].
+
+The whole SPIR-V dialect is exposed via multiple headers for better
+organization:
+
+* [SPIRVDialect.h][MlirSpirvDialect] defines the SPIR-V dialect.
+* [SPIRVTypes.h][MlirSpirvTypes] defines all SPIR-V specific types.
+* [SPIRVOps.h][MlirSpirvOps] defines all SPIR-V operations.
+* [Serialization.h][MlirSpirvSerialization] defines the entry points for
+  serialization and deserialization.
+
+The dialect itself, including all types and ops, is in the `MLIRSPIRV` library.
+Serialization functionalities are in the `MLIRSPIRVSerialization` library.
+
+### Op definitions
+
+We use [Op Definition Spec][ODS] to define all SPIR-V ops. They are written in
+TableGen syntax and placed in various `*Ops.td` files in the header directory.
+Those `*Ops.td` files are organized according to the instruction categories used
+in the SPIR-V specification, for example, an op belonging to the "Atomics
+Instructions" section is put in the `SPIRVAtomicOps.td` file.
+
+`SPIRVOps.td` serves as the master op definition file that includes all files
+for specific categories.
+
+`SPIRVBase.td` defines common classes and utilities used by various op
+definitions. It contains the TableGen SPIR-V dialect definition, SPIR-V
+versions, known extensions, various SPIR-V enums, TableGen SPIR-V types, and
+base op classes, etc.
+
+Many of the contents in `SPIRVBase.td`, e.g., the opcodes and various enums, and
+all `*Ops.td` files can be automatically updated via a Python script, which
+queries the SPIR-V specification and grammar. This greatly reduces the burden of
+supporting new ops and keeping updated with the SPIR-V spec. More details on
+this automated development can be found in the
+[Automated development flow](#automated-development-flow) section.
+
+### Dialect conversions
+
+The code for conversions from other dialects to the SPIR-V dialect also resides
+in a few places:
+
+* From GPU dialect: headers are at
+  [include/mlir/Conversion/GPUToSPIRV][MlirGpuToSpirvHeaders]; libraries are
+  at [lib/Conversion/GPUToSPIRV][MlirGpuToSpirvLibs].
+* From standard dialect: headers are at
+  [include/mlir/Conversion/StandardToSPIRV][MlirStdToSpirvHeaders]; libraries
+  are at [lib/Conversion/StandardToSPIRV][MlirStdToSpirvLibs].
+
+These dialect to dialect conversions have their dedicated libraries,
+`MLIRGPUToSPIRVTransforms` and `MLIRStandardToSPIRVTransforms`, respectively.
+
+There are also common utilities when targeting SPIR-V from any dialect:
+
+* [include/mlir/Dialect/SPIRV/Passes.h][MlirSpirvPasses] contains SPIR-V
+  specific analyses and transformations.
+* [include/mlir/Dialect/SPIRV/SPIRVLowering.h][MlirSpirvLowering] contains
+  type converters and other utility functions.
+
+These common utilities are implemented in the `MLIRSPIRVTransforms` library.
+
+## Contribution
+
+All kinds of contributions are highly appreciated! :) We have GitHub issues for
+tracking the [dialect][GitHubDialectTracking] and
+[lowering][GitHubLoweringTracking] development. You can find todo tasks there.
+The [Code organization](#code-organization) section gives an overview of how
+SPIR-V related functionalities are implemented in MLIR. This section gives more
+concrete steps on how to contribute.
+
+### Automated development flow
+
+One of the goals of SPIR-V dialect development is to leverage both the SPIR-V
+[human-readable specification][SpirvSpec] and
+[machine-readable grammar][SpirvGrammar] to auto-generate as much contents as
+possible. Specifically, the following tasks can be automated (partially or
+fully):
+
+* Adding support for a new operation.
+* Adding support for a new SPIR-V enum.
+* Serialization and deserialization of a new operation.
+
+We achieve this using the Python script
+[`gen_spirv_dialect.py`][GenSpirvUtilsPy]. It fetches the human-readable
+specification and machine-readable grammar directly from the Internet and
+updates various SPIR-V `*.td` files in place. The script gives us an automated
+flow for adding support for new ops or enums.
+
+Afterwards, we have SPIR-V specific `mlir-tblgen` backends for reading the Op
+Definition Spec and generating various components, including (de)serialization
+logic for ops. Together with standard `mlir-tblgen` backends, we auto-generate
+all op classes, enum classes, etc.
+
+In the following subsections, we list the detailed steps to follow for common
+tasks.
+
+### Add a new op
+
+To add a new op, invoke the `define_inst.sh` script wrapper in utils/spirv.
+`define_inst.sh` requires a few parameters:
+
+```sh
+./define_inst.sh <filename> <base-class-name> <opname>
+```
+
+For example, to define the op for `OpIAdd`, invoke
+
+```sh
+./define_inst.sh SPIRVArithmeticOps.td ArithmeticBinaryOp OpIAdd
+```
+
+where `SPIRVArithmeticOps.td` is the filename for hosting the new op and
+`ArithmeticBinaryOp` is the direct base class the newly defined op will derive
+from.
+
+Similarly, to define the op for `OpAtomicAnd`,
+
+```sh
+./define_inst.sh SPIRVAtomicOps.td AtomicUpdateWithValueOp OpAtomicAnd
+```
+
+Note that the generated SPIR-V op definition is just a best-effort template; it
+is still expected to be updated to have more accurate traits, arguments, and
+results.
+
+The generated op will automatically gain the logic for (de)serialization.
+However, tests still need to be coupled with the change to make sure there are
+no surprises. Serialization tests live in test/Dialect/SPIRV/Serialization.
+
+### Add a new enum
+
+To add a new enum, invoke the `define_enum.sh` script wrapper in utils/spirv.
+`define_enum.sh` expects the following parameters:
+
+```sh
+./define_enum.sh <enum-class-name>
+```
+
+For example, to add the definition for SPIR-V storage class into
+`SPIRVBase.td`:
+
+```sh
+./define_enum.sh StorageClass
+```
+
+### Add a new conversion
+
+(TODO: add details for this section)
+
+[Spirv]: https://www.khronos.org/registry/spir-v/
+[SpirvSpec]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html
+[SpirvLogicalLayout]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#_a_id_logicallayout_a_logical_layout_of_a_module
+[SpirvGrammar]: https://raw.githubusercontent.com/KhronosGroup/SPIRV-Headers/master/include/spirv/unified1/spirv.core.grammar.json
+[GlslStd450]: https://www.khronos.org/registry/spir-v/specs/1.0/GLSL.std.450.html
+[ArrayType]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#OpTypeArray
+[ImageType]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#OpTypeImage
+[PointerType]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#OpTypePointer
+[RuntimeArrayType]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#OpTypeRuntimeArray
+[StructType]: https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#Structure
+[SpirvTools]: https://github.com/KhronosGroup/SPIRV-Tools
+[Rationale]: https://github.com/tensorflow/mlir/blob/master/g3doc/Rationale.md#block-arguments-vs-phi-nodes
+[ODS]: https://github.com/tensorflow/mlir/blob/master/g3doc/OpDefinitions.md
+[GreedyPatternRewriter]: https://github.com/tensorflow/mlir/blob/master/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+[MlirSpirvHeaders]: https://github.com/tensorflow/mlir/tree/master/include/mlir/Dialect/SPIRV
+[MlirSpirvLibs]: https://github.com/tensorflow/mlir/tree/master/lib/Dialect/SPIRV
+[MlirSpirvTests]: https://github.com/tensorflow/mlir/tree/master/test/Dialect/SPIRV
+[MlirSpirvUnittests]: https://github.com/tensorflow/mlir/tree/master/unittests/Dialect/SPIRV
+[MlirGpuToSpirvHeaders]: 
https://github.com/tensorflow/mlir/tree/master/include/mlir/Conversion/GPUToSPIRV +[MlirGpuToSpirvLibs]: https://github.com/tensorflow/mlir/tree/master/lib/Conversion/GPUToSPIRV +[MlirStdToSpirvHeaders]: https://github.com/tensorflow/mlir/tree/master/include/mlir/Conversion/StandardToSPIRV +[MlirStdToSpirvLibs]: https://github.com/tensorflow/mlir/tree/master/lib/Conversion/StandardToSPIRV +[MlirSpirvDialect]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVDialect.h +[MlirSpirvTypes]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVTypes.h +[MlirSpirvOps]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVOps.h +[MlirSpirvSerialization]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/Serialization.h +[MlirSpirvBase]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVBase.td +[MlirSpirvPasses]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/Passes.h +[MlirSpirvLowering]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVLowering.h +[MlirSpirvAbi]: https://github.com/tensorflow/mlir/blob/master/include/mlir/Dialect/SPIRV/SPIRVLowering.td +[GitHubDialectTracking]: https://github.com/tensorflow/mlir/issues/302 +[GitHubLoweringTracking]: https://github.com/tensorflow/mlir/issues/303 +[GenSpirvUtilsPy]: https://github.com/tensorflow/mlir/blob/master/utils/spirv/gen_spirv_dialect.py diff --git a/mlir/docs/Dialects/Standard.md b/mlir/docs/Dialects/Standard.md new file mode 100644 index 0000000000000000000000000000000000000000..f84a2c94e921ed463b5730053bb7b89a89bba012 --- /dev/null +++ b/mlir/docs/Dialects/Standard.md @@ -0,0 +1,1146 @@ +# Standard Dialect + +This dialect provides documentation for operations within the Standard dialect. 
+ +Note: This dialect is a collection of operations for several different concepts, +and should be split into multiple more-focused dialects accordingly. + +[TOC] + +TODO: shape, which returns a 1D tensor, and can take an unknown rank tensor as +input. + +TODO: rank, which returns an index. + +## Terminator operations + +Terminator operations are required at the end of each block. They may contain a +list of successors, i.e. other blocks to which the control flow will proceed. + +### 'br' terminator operation + +Syntax: + +``` +operation ::= `br` successor +successor ::= bb-id branch-use-list? +branch-use-list ::= `(` ssa-use-list `:` type-list-no-parens `)` +``` + +The `br` terminator operation represents an unconditional jump to a target +block. The count and types of operands to the branch must align with the +arguments in the target block. + +The MLIR branch operation is not allowed to target the entry block for a region. + +### 'cond_br' terminator operation + +Syntax: + +``` +operation ::= `cond_br` ssa-use `,` successor `,` successor +``` + +The `cond_br` terminator operation represents a conditional branch on a boolean +(1-bit integer) value. If the bit is set, then the first destination is jumped +to; if it is false, the second destination is chosen. The count and types of +operands must align with the arguments in the corresponding target blocks. + +The MLIR conditional branch operation is not allowed to target the entry block +for a region. The two destinations of the conditional branch operation are +allowed to be the same. 
+ +The following example illustrates a function with a conditional branch operation +that targets the same block: + +```mlir +func @select(i32, i32, i1) -> i32 { +^bb0(%a : i32, %b :i32, %flag : i1) : + // Both targets are the same, operands differ + cond_br %flag, ^bb1(%a : i32), ^bb1(%b : i32) + +^bb1(%x : i32) : + return %x : i32 +} +``` + +### 'return' terminator operation + +Syntax: + +``` +operation ::= `return` (ssa-use-list `:` type-list-no-parens)? +``` + +The `return` terminator operation represents the completion of a function, and +produces the result values. The count and types of the operands must match the +result types of the enclosing function. It is legal for multiple blocks in a +single function to return. + +## Core Operations + +### 'call' operation + +Syntax: + +``` +operation ::= + (ssa-id `=`)? `call` symbol-ref-id `(` ssa-use-list? `)` `:` function-type +``` + +The `call` operation represents a direct call to a function. The operands and +result types of the call must match the specified function type. The callee is +encoded as a function attribute named "callee". + +Example: + +```mlir +// Calling the function my_add. +%31 = call @my_add(%0, %1) : (tensor<16xf32>, tensor<16xf32>) -> tensor<16xf32> +``` + +### 'call_indirect' operation + +Syntax: + +``` +operation ::= `call_indirect` ssa-use `(` ssa-use-list? `)` `:` function-type +``` + +The `call_indirect` operation represents an indirect call to a value of function +type. Functions are first class types in MLIR, and may be passed as arguments +and merged together with block arguments. The operands and result types of the +call must match the specified function type. + +Function values can be created with the +[`constant` operation](#constant-operation). 
+ +Example: + +```mlir +%31 = call_indirect %15(%0, %1) + : (tensor<16xf32>, tensor<16xf32>) -> tensor<16xf32> +``` + +### 'dim' operation + +Syntax: + +``` +operation ::= ssa-id `=` `dim` ssa-id `,` integer-literal `:` type +``` + +The `dim` operation takes a memref or tensor operand and a dimension index, and +returns an [`index`](../LangRef.md#index-type) that is the size of that +dimension. + +The `dim` operation is represented with a single integer attribute named +`index`, and the type specifies the type of the memref or tensor operand. + +Examples: + +```mlir +// Always returns 4, can be constant folded: +%x = dim %A, 0 : tensor<4 x ? x f32> + +// Returns the dynamic dimension of %A. +%y = dim %A, 1 : tensor<4 x ? x f32> + +// Equivalent generic form: +%x = "std.dim"(%A) {index = 0 : i64} : (tensor<4 x ? x f32>) -> index +%y = "std.dim"(%A) {index = 1 : i64} : (tensor<4 x ? x f32>) -> index +``` + +## Memory Operations + +### 'alloc' operation + +Syntax: + +``` +operation ::= ssa-id `=` `alloc` dim-and-symbol-use-list `:` memref-type +``` + +Allocates a new memref of specified type. Values required for dynamic dimension +sizes are passed as arguments in parentheses (in the same order in which they +appear in the shape signature of the memref) while the symbols required by the +layout map are passed in the square brackets in lexicographical order. If no +layout maps are specified in the memref, then an identity mapping is used. + +The buffer referenced by a memref type is created by the `alloc` operation, and +destroyed by the `dealloc` operation. + +Example: + +```mlir +// Allocating memref for a fully static shape. +%A = alloc() : memref<1024x64xf32, #layout_map0, memspace0> + +// %M, %N, %x, %y are SSA values of integer type. M and N are bound to the +// two unknown dimensions of the type and x/y are bound to symbols in +// #layout_map1. 
+%B = alloc(%M, %N)[%x, %y] : memref<?x?xf32, #layout_map1, memspace1>
+```
+
+### 'alloc_static' operation
+
+Syntax:
+
+```
+operation ::=
+    ssa-id `=` `alloc_static` `(` integer-literal `)` :  memref-type
+```
+
+Allocates a new memref of specified type with a fixed base pointer location in
+memory. 'alloc_static' does not support types that have dynamic shapes or that
+require dynamic symbols in their layout function (use the
+[`alloc` operation](#alloc-operation) in those cases).
+
+Example:
+
+```mlir
+%A = alloc_static(0x1232a00) : memref<1024 x 64 x f32, #layout_map0, memspace0>
+```
+
+The `alloc_static` operation is used to represent code after buffer allocation
+has been performed.
+
+### 'dealloc' operation
+
+Syntax:
+
+```
+operation ::= `dealloc` ssa-use `:` memref-type
+```
+
+Delineates the end of the lifetime of the memory corresponding to a memref
+allocation. It is paired with an [`alloc`](#alloc-operation) or
+[`alloc_static`](#alloc-static-operation) operation.
+
+Example:
+
+```mlir
+dealloc %A : memref<128 x f32, #layout, memspace0>
+```
+
+### 'dma_start' operation
+
+Syntax:
+
+```
+operation ::= `dma_start` ssa-use`[`ssa-use-list`]` `,`
+              ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
+              ssa-use`[`ssa-use-list`]` (`,` ssa-use `,` ssa-use)?
+              `:` memref-type `,` memref-type `,` memref-type
+```
+
+Starts a non-blocking DMA operation that transfers data from a source memref to
+a destination memref. The operands include the source and destination memref's
+each followed by its indices, size of the data transfer in terms of the number
+of elements (of the elemental type of the memref), a tag memref with its
+indices, and optionally two additional arguments corresponding to the stride (in
+terms of number of elements) and the number of elements to transfer per stride.
+The tag location is used by a dma_wait operation to check for completion.
The
+indices of the source memref, destination memref, and the tag memref have the
+same restrictions as any load/store operation in an affine context (whenever DMA
+operations appear in an affine context). See
+[restrictions on dimensions and symbols](Affine.md#restrictions-on-dimensions-and-symbols)
+in affine contexts. This allows powerful static analysis and transformations in
+the presence of such DMAs including rescheduling, pipelining / overlap with
+computation, and checking for matching start/end operations. The source and
+destination memref need not be of the same dimensionality, but need to have the
+same elemental type.
+
+For example, a `dma_start` operation that transfers 32 vector elements from a
+memref `%src` at location `[%i, %j]` to memref `%dst` at `[%k, %l]` would be
+specified as shown below.
+
+Example:
+
+```mlir
+%size = constant 32 : index
+%tag = alloc() : memref<1 x i32, (d0) -> (d0), 4>
+%idx = constant 0 : index
+dma_start %src[%i, %j], %dst[%k, %l], %size, %tag[%idx] :
+  memref<40 x 8 x vector<16xf32>, (d0, d1) -> (d0, d1), 0>,
+  memref<2 x 4 x vector<16xf32>, (d0, d1) -> (d0, d1), 2>,
+  memref<1 x i32, (d0) -> (d0), 4>
+```
+
+### 'dma_wait' operation
+
+Syntax:
+
+```
+operation ::= `dma_wait` ssa-use`[`ssa-use-list`]` `,` ssa-use `:` memref-type
+```
+
+Blocks until the completion of a DMA operation associated with the tag element
+specified with a tag memref and its indices. The operands include the tag memref
+followed by its indices and the number of elements associated with the DMA being
+waited on. The indices of the tag memref have the same restrictions as
+load/store indices.
+
+Example:
+
+```mlir
+dma_wait %tag[%idx], %size : memref<1 x i32, (d0) -> (d0), 4>
+```
+
+### 'extract_element' operation
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `extract_element` ssa-use `[` ssa-use-list `]` `:` type
+```
+
+The `extract_element` op reads a tensor or vector and returns one element from
+it specified by an index list.
The output of the 'extract_element' is a new +value with the same type as the elements of the tensor or vector. The arity of +indices matches the rank of the accessed value (i.e., if a tensor is of rank 3, +then 3 indices are required for the extract. The indices should all be of +`index` type. + +Examples: + +```mlir +%3 = extract_element %v[%1, %2] : vector<4x4xi32> +%4 = extract_element %t[%1, %2] : tensor<4x4xi32> +%5 = extract_element %ut[%1, %2] : tensor<*xi32> +``` + +### 'load' operation + +Syntax: + +``` +operation ::= ssa-id `=` `load` ssa-use `[` ssa-use-list `]` `:` memref-type +``` + +The `load` op reads an element from a memref specified by an index list. The +output of load is a new value with the same type as the elements of the memref. +The arity of indices is the rank of the memref (i.e., if the memref loaded from +is of rank 3, then 3 indices are required for the load following the memref +identifier). + +In an `affine.if` or `affine.for` body, the indices of a load are restricted to +SSA values bound to surrounding loop induction variables, +[symbols](../LangRef.md#dimensions-and-symbols), results of a +[`constant` operation](#constant-operation), or the result of an `affine.apply` +operation that can in turn take as arguments all of the aforementioned SSA +values or the recursively result of such an `affine.apply` operation. + +Example: + +```mlir +%1 = affine.apply (d0, d1) -> (3*d0) (%i, %j) +%2 = affine.apply (d0, d1) -> (d1+1) (%i, %j) +%12 = load %A[%1, %2] : memref<8x?xi32, #layout, memspace0> + +// Example of an indirect load (treated as non-affine) +%3 = affine.apply (d0) -> (2*d0 + 1)(%12) +%13 = load %A[%3, %2] : memref<4x?xi32, #layout, memspace0> +``` + +**Context:** The `load` and `store` operations are specifically crafted to fully +resolve a reference to an element of a memref, and (in affine `affine.if` and +`affine.for` operations) the compiler can follow use-def chains (e.g. 
through
+[`affine.apply`](Affine.md#affineapply-operation) operations) to precisely
+analyze references at compile-time using polyhedral techniques. This is possible
+because of the
+[restrictions on dimensions and symbols](Affine.md#restrictions-on-dimensions-and-symbols)
+in these contexts.
+
+### 'splat' operation
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `splat` ssa-use `:` ( vector-type | tensor-type )
+```
+
+Broadcast the operand to all elements of the result vector or tensor. The
+operand has to be of either integer or float type. When the result is a tensor,
+it has to be statically shaped.
+
+Example:
+
+```mlir
+  %s = load %A[%i] : memref<128xf32>
+  %v = splat %s : vector<4xf32>
+  %t = splat %s : tensor<8x16xi32>
+```
+
+TODO: This operation is easy to extend to broadcast to dynamically shaped
+tensors in the same way dynamically shaped memrefs are handled.
+```mlir
+// Broadcasts %s to a 2-d dynamically shaped tensor, with %m, %n binding
+// to the sizes of the two dynamic dimensions.
+%m = "foo"() : () -> (index)
+%n = "bar"() : () -> (index)
+%t = splat %s [%m, %n] : tensor<?x?xf32>
+```
+
+### 'store' operation
+
+Syntax:
+
+```
+operation ::= `store` ssa-use `,` ssa-use `[` ssa-use-list `]` `:` memref-type
+```
+
+Store value to memref location given by indices. The value stored should have
+the same type as the elemental type of the memref. The number of arguments
+provided within brackets needs to match the rank of the memref.
+
+In an affine context, the indices of a store are restricted to SSA values bound
+to surrounding loop induction variables,
+[symbols](Affine.md#restrictions-on-dimensions-and-symbols), results of a
+[`constant` operation](#constant-operation), or the result of an
+[`affine.apply`](Affine.md#affineapply-operation) operation that can in turn
+take as arguments all of the aforementioned SSA values or recursively the result
+of such an `affine.apply` operation.
+ +Example: + +```mlir +store %100, %A[%1, 1023] : memref<4x?xf32, #layout, memspace0> +``` + +**Context:** The `load` and `store` operations are specifically crafted to fully +resolve a reference to an element of a memref, and (in polyhedral `affine.if` +and `affine.for` operations) the compiler can follow use-def chains (e.g. +through [`affine.apply`](Affine.md#affineapply-operation) operations) to +precisely analyze references at compile-time using polyhedral techniques. This +is possible because of the +[restrictions on dimensions and symbols](Affine.md#restrictions-on-dimensions-and-symbols) +in these contexts. + +### 'tensor_load' operation + +Syntax: + +``` +operation ::= ssa-id `=` `tensor_load` ssa-use-and-type +``` + +Create a tensor from a memref, making an independent copy of the element data. +The result value is a tensor whose shape and element type match the memref +operand. + +Example: + +```mlir +// Produces a value of tensor<4x?xf32> type. +%12 = tensor_load %10 : memref<4x?xf32, #layout, memspace0> +``` + +### 'tensor_store' operation + +Syntax: + +``` +operation ::= `tensor_store` ssa-use `,` ssa-use `:` memref-type +``` + +Stores the contents of a tensor into a memref. The first operand is a value of +tensor type, the second operand is a value of memref type. The shapes and +element types of these must match, and are specified by the memref type. + +Example: + +```mlir +%9 = dim %8, 1 : tensor<4x?xf32> +%10 = alloc(%9) : memref<4x?xf32, #layout, memspace0> +tensor_store %8, %10 : memref<4x?xf32, #layout, memspace0> +``` + +## Unary Operations + +### 'absf' operation + +Syntax: + +``` +operation ::= ssa-id `=` `absf` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar absolute value. +%a = absf %b : f64 + +// SIMD vector element-wise absolute value. +%f = absf %g : vector<4xf32> + +// Tensor element-wise absolute value. +%x = absf %y : tensor<4x?xf8> +``` + +The `absf` operation computes the absolute value. 
It takes one operand and +returns one result of the same type. This type may be a float scalar type, a +vector whose element type is float, or a tensor of floats. It has no standard +attributes. + +### 'ceilf' operation + +Syntax: + +``` +operation ::= ssa-id `=` `ceilf` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar ceiling value. +%a = ceilf %b : f64 + +// SIMD vector element-wise ceiling value. +%f = ceilf %g : vector<4xf32> + +// Tensor element-wise ceiling value. +%x = ceilf %y : tensor<4x?xf8> +``` + +The `ceilf` operation computes the ceiling of a given value. It takes one +operand and returns one result of the same type. This type may be a float +scalar type, a vector whose element type is float, or a tensor of floats. It +has no standard attributes. + +### 'cos' operation + +Syntax: + +``` +operation ::= ssa-id `=` `cos` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar cosine value. +%a = cos %b : f64 + +// SIMD vector element-wise cosine value. +%f = cos %g : vector<4xf32> + +// Tensor element-wise cosine value. +%x = cos %y : tensor<4x?xf8> +``` + +The `cos` operation computes the cosine of a given value. It takes one operand +and returns one result of the same type. This type may be a float scalar type, +a vector whose element type is float, or a tensor of floats. It has no standard +attributes. + +### 'exp' operation + +Syntax: + +``` +operation ::= ssa-id `=` `exp` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar natural exponential. +%a = exp %b : f64 + +// SIMD vector element-wise natural exponential. +%f = exp %g : vector<4xf32> + +// Tensor element-wise natural exponential. +%x = exp %y : tensor<4x?xf8> +``` + +The `exp` operation takes one operand and returns one result of the same type. +This type may be a float scalar type, a vector whose element type is float, or a +tensor of floats. It has no standard attributes. 
+ +### 'negf' operation + +Syntax: + +``` +operation ::= ssa-id `=` `negf` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar negation value. +%a = negf %b : f64 + +// SIMD vector element-wise negation value. +%f = negf %g : vector<4xf32> + +// Tensor element-wise negation value. +%x = negf %y : tensor<4x?xf8> +``` + +The `negf` operation computes the negation of a given value. It takes one +operand and returns one result of the same type. This type may be a float +scalar type, a vector whose element type is float, or a tensor of floats. It +has no standard attributes. + +### 'tanh' operation + +Syntax: + +``` +operation ::= ssa-id `=` `tanh` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar hyperbolic tangent value. +%a = tanh %b : f64 + +// SIMD vector element-wise hyperbolic tangent value. +%f = tanh %g : vector<4xf32> + +// Tensor element-wise hyperbolic tangent value. +%x = tanh %y : tensor<4x?xf8> +``` + +The `tanh` operation computes the hyperbolic tangent. It takes one operand and +returns one result of the same type. This type may be a float scalar type, a +vector whose element type is float, or a tensor of floats. It has no standard +attributes. + +## Arithmetic Operations + +Basic arithmetic in MLIR is specified by standard operations described in this +section. + +### 'addi' operation + +Syntax: + +``` +operation ::= ssa-id `=` `addi` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar addition. +%a = addi %b, %c : i64 + +// SIMD vector element-wise addition, e.g. for Intel SSE. +%f = addi %g, %h : vector<4xi32> + +// Tensor element-wise addition. +%x = addi %y, %z : tensor<4x?xi8> +``` + +The `addi` operation takes two operands and returns one result, each of these is +required to be the same type. This type may be an integer scalar type, a vector +whose element type is integer, or a tensor of integers. It has no standard +attributes. 
+ +### 'addf' operation + +Syntax: + +``` +operation ::= ssa-id `=` `addf` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar addition. +%a = addf %b, %c : f64 + +// SIMD vector addition, e.g. for Intel SSE. +%f = addf %g, %h : vector<4xf32> + +// Tensor addition. +%x = addf %y, %z : tensor<4x?xbf16> +``` + +The `addf` operation takes two operands and returns one result, each of these is +required to be the same type. This type may be a floating point scalar type, a +vector whose element type is a floating point type, or a floating point tensor. + +It has no standard attributes. + +TODO: In the distant future, this will accept optional attributes for fast math, +contraction, rounding mode, and other controls. + +### 'and' operation + +Bitwise integer and. + +Syntax: + +``` +operation ::= ssa-id `=` `and` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar integer bitwise and. +%a = and %b, %c : i64 + +// SIMD vector element-wise bitwise integer and. +%f = and %g, %h : vector<4xi32> + +// Tensor element-wise bitwise integer and. +%x = and %y, %z : tensor<4x?xi8> +``` + +The `and` operation takes two operands and returns one result, each of these is +required to be the same type. This type may be an integer scalar type, a vector +whose element type is integer, or a tensor of integers. It has no standard +attributes. + +### 'cmpi' operation + +Syntax: + +``` +operation ::= ssa-id `=` `cmpi` string-literal `,` ssa-id `,` ssa-id `:` type +``` + +Examples: + +```mlir +// Custom form of scalar "signed less than" comparison. +%x = cmpi "slt", %lhs, %rhs : i32 + +// Generic form of the same operation. +%x = "std.cmpi"(%lhs, %rhs) {predicate = 2 : i64} : (i32, i32) -> i1 + +// Custom form of vector equality comparison. +%x = cmpi "eq", %lhs, %rhs : vector<4xi64> + +// Generic form of the same operation. 
+%x = "std.cmpi"(%lhs, %rhs) {predicate = 0 : i64} + : (vector<4xi64>, vector<4xi64>) -> vector<4xi1> +``` + +The `cmpi` operation is a generic comparison for integer-like types. Its two +arguments can be integers, vectors or tensors thereof as long as their types +match. The operation produces an i1 for the former case, a vector or a tensor of +i1 with the same shape as inputs in the other cases. + +Its first argument is an attribute that defines which type of comparison is +performed. The following comparisons are supported: + +- equal (mnemonic: `"eq"`; integer value: `0`) +- not equal (mnemonic: `"ne"`; integer value: `1`) +- signed less than (mnemonic: `"slt"`; integer value: `2`) +- signed less than or equal (mnemonic: `"sle"`; integer value: `3`) +- signed greater than (mnemonic: `"sgt"`; integer value: `4`) +- signed greater than or equal (mnemonic: `"sge"`; integer value: `5`) +- unsigned less than (mnemonic: `"ult"`; integer value: `6`) +- unsigned less than or equal (mnemonic: `"ule"`; integer value: `7`) +- unsigned greater than (mnemonic: `"ugt"`; integer value: `8`) +- unsigned greater than or equal (mnemonic: `"uge"`; integer value: `9`) + +The result is `1` if the comparison is true and `0` otherwise. For vector or +tensor operands, the comparison is performed elementwise and the element of the +result indicates whether the comparison is true for the operand elements with +the same indices as those of the result. + +Note: while the custom assembly form uses strings, the actual underlying +attribute has integer type (or rather enum class in C++ code) as seen from the +generic assembly form. String literals are used to improve readability of the IR +by humans. + +This operation only applies to integer-like operands, but not floats. 
The main +reason being that comparison operations have diverging sets of attributes: +integers require sign specification while floats require various floating +point-related particularities, e.g., `-ffast-math` behavior, IEEE754 compliance, +etc +([rationale](../Rationale.md#splitting-floating-point-vs-integer-operations)). +The type of comparison is specified as attribute to avoid introducing ten +similar operations, taking into account that they are often implemented using +the same operation downstream +([rationale](../Rationale.md#specifying-comparison-kind-as-attribute)). The +separation between signed and unsigned order comparisons is necessary because of +integers being signless. The comparison operation must know how to interpret +values with the foremost bit being set: negatives in two's complement or large +positives +([rationale](../Rationale.md#specifying-sign-in-integer-comparison-operations)). + +### 'constant' operation + +Syntax: + +``` +operation ::= ssa-id `=` `constant` attribute-value `:` type +``` + +The `constant` operation produces an SSA value equal to some constant specified +by an attribute. This is the way that MLIR uses to form simple integer and +floating point constants, as well as more exotic things like references to +functions and (TODO!) tensor/vector constants. + +The `constant` operation is represented with a single attribute named "value". +The type specifies the result type of the operation. + +Examples: + +```mlir +// Integer constant +%1 = constant 42 : i32 + +// Reference to function @myfn. 
+%3 = constant @myfn : (tensor<16xf32>, f32) -> tensor<16xf32>
+
+// Equivalent generic forms
+%1 = "std.constant"() {value = 42 : i32} : () -> i32
+%3 = "std.constant"() {value = @myfn}
+    : () -> ((tensor<16xf32>, f32) -> tensor<16xf32>)
+
+```
+
+MLIR does not allow direct references to functions in SSA operands because the
+compiler is multithreaded, and disallowing SSA values to directly reference a
+function simplifies this
+([rationale](../Rationale.md#multithreading-the-compiler)).
+
+### 'copysign' operation
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `copysign` ssa-use `,` ssa-use `:` type
+```
+
+Examples:
+
+```mlir
+// Scalar copysign value.
+%a = copysign %b, %c : f64
+
+// SIMD vector element-wise copysign value.
+%f = copysign %g, %h : vector<4xf32>
+
+// Tensor element-wise copysign value.
+%x = copysign %y, %z : tensor<4x?xf8>
+```
+
+The `copysign` returns a value with the magnitude of the first operand and the
+sign of the second operand. It takes two operands and returns one result of the
+same type. This type may be a float scalar type, a vector whose element type is
+float, or a tensor of floats. It has no standard attributes.
+
+### 'divis' operation
+
+Signed integer division. Rounds towards zero. Treats the leading bit as sign,
+i.e. `6 / -2 = -3`.
+
+Note: the semantics of division by zero or signed division overflow (minimum
+value divided by -1) is TBD; do NOT assume any specific behavior.
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `divis` ssa-use `,` ssa-use `:` type
+```
+
+Examples:
+
+```mlir
+// Scalar signed integer division.
+%a = divis %b, %c : i64
+
+// SIMD vector element-wise division.
+%f = divis %g, %h : vector<4xi32>
+
+// Tensor element-wise integer division.
+%x = divis %y, %z : tensor<4x?xi8>
+```
+
+The `divis` operation takes two operands and returns one result, each of these
+is required to be the same type. This type may be an integer scalar type, a
+vector whose element type is integer, or a tensor of integers.
It has no
+standard attributes.
+
+### 'diviu' operation
+
+Unsigned integer division. Rounds towards zero. Treats the leading bit as the
+most significant, i.e. for `i16` given two's complement representation, `6 /
+-2 = 6 / (2^16 - 2) = 0`.
+
+Note: the semantics of division by zero is TBD; do NOT assume any specific
+behavior.
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `diviu` ssa-use `,` ssa-use `:` type
+```
+
+Examples:
+
+```mlir
+// Scalar unsigned integer division.
+%a = diviu %b, %c : i64
+
+// SIMD vector element-wise division.
+%f = diviu %g, %h : vector<4xi32>
+
+// Tensor element-wise integer division.
+%x = diviu %y, %z : tensor<4x?xi8>
+```
+
+The `diviu` operation takes two operands and returns one result, each of these
+is required to be the same type. This type may be an integer scalar type, a
+vector whose element type is integer, or a tensor of integers. It has no
+standard attributes.
+
+### 'memref_cast' operation
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `memref_cast` ssa-use `:` type `to` type
+```
+
+Examples:
+
+```mlir
+// Discard static dimension information.
+%3 = memref_cast %2 : memref<4x?xf32> to memref<?x?xf32>
+
+// Convert to a type with more known dimensions.
+%4 = memref_cast %3 : memref<?x?xf32> to memref<4x?xf32>
+
+// Convert to a type with unknown rank.
+%5 = memref_cast %3 : memref<?x?xf32> to memref<*xf32>
+
+// Convert to a type with static rank.
+%6 = memref_cast %5 : memref<*xf32> to memref<?x?xf32>
+```
+
+Convert a memref from one type to an equivalent type without changing any data
+elements. The types are equivalent if 1. they both have the same static rank,
+same element type, same mappings, same address space. The operation is invalid
+if converting to a mismatching constant dimension, or 2. exactly one of the
+operands has an unknown rank, and they both have the same element type and same
+address space. The operation is invalid if both operands are of dynamic rank or
+if converting to a mismatching static rank.
+ +### 'mulf' operation + +Syntax: + +``` +operation ::= ssa-id `=` `mulf` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar multiplication. +%a = mulf %b, %c : f64 + +// SIMD pointwise vector multiplication, e.g. for Intel SSE. +%f = mulf %g, %h : vector<4xf32> + +// Tensor pointwise multiplication. +%x = mulf %y, %z : tensor<4x?xbf16> +``` + +The `mulf` operation takes two operands and returns one result, each of these is +required to be the same type. This type may be a floating point scalar type, a +vector whose element type is a floating point type, or a floating point tensor. + +It has no standard attributes. + +TODO: In the distant future, this will accept optional attributes for fast math, +contraction, rounding mode, and other controls. + +### 'or' operation + +Bitwise integer or. + +Syntax: + +``` +operation ::= ssa-id `=` `or` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar integer bitwise or. +%a = or %b, %c : i64 + +// SIMD vector element-wise bitwise integer or. +%f = or %g, %h : vector<4xi32> + +// Tensor element-wise bitwise integer or. +%x = or %y, %z : tensor<4x?xi8> +``` + +The `or` operation takes two operands and returns one result, each of these is +required to be the same type. This type may be an integer scalar type, a vector +whose element type is integer, or a tensor of integers. It has no standard +attributes. + +### 'remis' operation + +Signed integer division remainder. Treats the leading bit as sign, i.e. `6 % +-2 = 0`. + +Note: the semantics of division by zero is TBD; do NOT assume any specific +behavior. + +Syntax: + +``` +operation ::= ssa-id `=` `remis` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar signed integer division remainder. +%a = remis %b, %c : i64 + +// SIMD vector element-wise division remainder. +%f = remis %g, %h : vector<4xi32> + +// Tensor element-wise integer division remainder. 
+%x = remis %y, %z : tensor<4x?xi8> +``` + +The `remis` operation takes two operands and returns one result, each of these +is required to be the same type. This type may be an integer scalar type, a +vector whose element type is integer, or a tensor of integers. It has no +standard attributes. + +### 'remiu' operation + +Unsigned integer division remainder. Treats the leading bit as the most +significant, i.e. for `i16`, `6 % -2 = 6 % (2^16 - 2) = 6`. + +Note: the semantics of division by zero is TBD; do NOT assume any specific +behavior. + +Syntax: + +``` +operation ::= ssa-id `=` `remiu` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Scalar unsigned integer division remainder. +%a = remiu %b, %c : i64 + +// SIMD vector element-wise division remainder. +%f = remiu %g, %h : vector<4xi32> + +// Tensor element-wise integer division remainder. +%x = remiu %y, %z : tensor<4x?xi8> +``` + +The `remiu` operation takes two operands and returns one result, each of these +is required to be the same type. This type may be an integer scalar type, a +vector whose element type is integer, or a tensor of integers. It has no +standard attributes. + +### 'select' operation + +Syntax: + +``` +operation ::= ssa-id `=` `select` ssa-use `,` ssa-use `,` ssa-use `:` type +``` + +Examples: + +```mlir +// Custom form of scalar selection. +%x = select %cond, %true, %false : i32 + +// Generic form of the same operation. +%x = "std.select"(%cond, %true, %false) : (i1, i32, i32) -> i32 + +// Vector selection is element-wise +%vx = "std.select"(%vcond, %vtrue, %vfalse) + : (vector<42xi1>, vector<42xf32>, vector<42xf32>) -> vector<42xf32> +``` + +The `select` operation chooses one value based on a binary condition supplied as +its first operand. If the value of the first operand is `1`, the second operand +is chosen, otherwise the third operand is chosen. The second and the third +operand must have the same type. 
+
+The operation applies to vectors and tensors elementwise given the _shape_ of
+all operands is identical. The choice is made for each element individually
+based on the value at the same position as the element in the condition operand.
+
+The `select` operation combined with [`cmpi`](#cmpi-operation) can be used to
+implement `min` and `max` with signed or unsigned comparison semantics.
+
+### 'tensor_cast' operation
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `tensor_cast` ssa-use `:` type `to` type
+```
+
+Examples:
+
+```mlir
+// Convert from unknown rank to rank 2 with unknown dimension sizes.
+%2 = "std.tensor_cast"(%1) : (tensor<*xf32>) -> tensor<?x?xf32>
+%2 = tensor_cast %1 : tensor<*xf32> to tensor<?x?xf32>
+
+// Convert to a type with more known dimensions.
+%3 = "std.tensor_cast"(%2) : (tensor<?x?xf32>) -> tensor<4x?xf32>
+
+// Discard static dimension and rank information.
+%4 = "std.tensor_cast"(%3) : (tensor<4x?xf32>) -> tensor<?x?xf32>
+%5 = "std.tensor_cast"(%4) : (tensor<?x?xf32>) -> tensor<*xf32>
+```
+
+Convert a tensor from one type to an equivalent type without changing any data
+elements. The source and destination types must both be tensor types with the
+same element type. If both are ranked, then the rank should be the same and
+static dimensions should match. The operation is invalid if converting to a
+mismatching constant dimension.
+
+### 'xor' operation
+
+Bitwise integer xor.
+
+Syntax:
+
+```
+operation ::= ssa-id `=` `xor` ssa-use `,` ssa-use `:` type
+```
+
+Examples:
+
+```mlir
+// Scalar integer bitwise xor.
+%a = xor %b, %c : i64
+
+// SIMD vector element-wise bitwise integer xor.
+%f = xor %g, %h : vector<4xi32>
+
+// Tensor element-wise bitwise integer xor.
+%x = xor %y, %z : tensor<4x?xi8>
+```
+
+The `xor` operation takes two operands and returns one result, each of these is
+required to be the same type. This type may be an integer scalar type, a vector
+whose element type is integer, or a tensor of integers. It has no standard
+attributes.
diff --git a/mlir/docs/Dialects/Vector.md b/mlir/docs/Dialects/Vector.md new file mode 100644 index 0000000000000000000000000000000000000000..04f5ba71cdbd05c83f60573980335ca82340de3d --- /dev/null +++ b/mlir/docs/Dialects/Vector.md @@ -0,0 +1,14 @@ +# Vector Dialect + +This dialect provides mid-level abstraction for the MLIR super-vectorizer. + +[TOC] + +## Operations + +# To see op documentation + +```sh +mlir-tblgen --gen-op-doc -I /path/to/mlir/include \ +/path/to/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +``` diff --git a/mlir/docs/EDSC.md b/mlir/docs/EDSC.md new file mode 100644 index 0000000000000000000000000000000000000000..eaaeb6c7009bc03ded57904f5ff83cf0f8115ce5 --- /dev/null +++ b/mlir/docs/EDSC.md @@ -0,0 +1,132 @@ +# Background: declarative builders API + +The main purpose of the declarative builders API is to provide an intuitive way +of constructing MLIR programmatically. In the majority of cases, the IR we wish +to construct exhibits structured control-flow. Declarative builders provide an +API to make MLIR construction and manipulation very idiomatic, for the +structured control-flow case, in C++. + +## ScopedContext + +`mlir::edsc::ScopedContext` provides an implicit thread-local context, +supporting a simple declarative API with globally accessible builders. These +declarative builders are available within the lifetime of a `ScopedContext`. + +## ValueHandle and IndexHandle + +`mlir::edsc::ValueHandle` and `mlir::edsc::IndexHandle` provide typed +abstractions around an `mlir::Value`. These abstractions are "delayed", in the +sense that they allow separating declaration from definition. They may capture +IR snippets, as they are built, for programmatic manipulation. Intuitive +operators are provided to allow concise and idiomatic expressions. 
+
+```c++
+ValueHandle zero = constant_index(0);
+IndexHandle i, j, k;
+```
+
+## Intrinsics
+
+`mlir::edsc::ValueBuilder` is a generic wrapper for the `mlir::Builder::create`
+method that operates on `ValueHandle` objects and returns a single ValueHandle.
+For instructions that return no values or that return multiple values, the
+`mlir::edsc::InstructionBuilder` can be used. Named intrinsics are provided as
+syntactic sugar to further reduce boilerplate.
+
+```c++
+using load = ValueBuilder<LoadOp>;
+using store = InstructionBuilder<StoreOp>;
+```
+
+## LoopBuilder and AffineLoopNestBuilder
+
+`mlir::edsc::AffineLoopNestBuilder` provides an interface to allow writing
+concise and structured loop nests.
+
+```c++
+  ScopedContext scope(f.get());
+  ValueHandle i(indexType),
+              j(indexType),
+              lb(f->getArgument(0)),
+              ub(f->getArgument(1));
+  ValueHandle f7(constant_float(llvm::APFloat(7.0f), f32Type)),
+              f13(constant_float(llvm::APFloat(13.0f), f32Type)),
+              i7(constant_int(7, 32)),
+              i13(constant_int(13, 32));
+  AffineLoopNestBuilder(&i, lb, ub, 3)([&]{
+      lb * index_t(3) + ub;
+      lb + index_t(3);
+      AffineLoopNestBuilder(&j, lb, ub, 2)([&]{
+          ceilDiv(index_t(31) * floorDiv(i + j * index_t(3), index_t(32)),
+                  index_t(32));
+          ((f7 + f13) / f7) % f13 - f7 * f13;
+          ((i7 + i13) / i7) % i13 - i7 * i13;
+      });
+  });
+```
+
+## IndexedValue
+
+`mlir::edsc::IndexedValue` provides an index notation around load and store
+operations on abstract data types by overloading the C++ assignment and
+parenthesis operators. The relevant loads and stores are emitted as appropriate.
+
+## Putting it all together
+
+With declarative builders, it becomes fairly concise to build rank and
+type-agnostic custom operations even though MLIR does not yet have generic
+types. Here is what a definition of a general pointwise add looks like in
+Tablegen with declarative builders.
+
+```c++
+def AddOp : Op<"x.add">,
+    Arguments<(ins Tensor:$A, Tensor:$B)>,
+    Results<(outs Tensor: $C)> {
+  code referenceImplementation = [{
+    auto ivs = makeIndexHandles(view_A.rank());
+    auto pivs = makePIndexHandles(ivs);
+    IndexedValue A(arg_A), B(arg_B), C(arg_C);
+    AffineLoopNestBuilder(pivs, view_A.getLbs(), view_A.getUbs(), view_A.getSteps())(
+      [&]{
+        C(ivs) = A(ivs) + B(ivs);
+      });
+  }];
+}
+```
+
+Depending on the function signature on which this emitter is called, the
+generated IR resembles the following, for a 4-D memref of `vector<4xi8>`:
+
+```
+// CHECK-LABEL: func @t1(%lhs: memref<3x4x5x6xvector<4xi8>>, %rhs: memref<3x4x5x6xvector<4xi8>>, %result: memref<3x4x5x6xvector<4xi8>>) -> () {
+// CHECK: affine.for {{.*}} = 0 to 3 {
+// CHECK: affine.for {{.*}} = 0 to 4 {
+// CHECK: affine.for {{.*}} = 0 to 5 {
+// CHECK: affine.for {{.*}} = 0 to 6 {
+// CHECK: {{.*}} = load %arg1[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+// CHECK: {{.*}} = load %arg0[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+// CHECK: {{.*}} = addi {{.*}} : vector<4xi8>
+// CHECK: store {{.*}}, %arg2[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+```
+
+or the following, for a 0-D `memref<f32>`:
+
+```
+// CHECK-LABEL: func @t3(%lhs: memref<f32>, %rhs: memref<f32>, %result: memref<f32>) -> () {
+// CHECK: {{.*}} = load %arg1[] : memref<f32>
+// CHECK: {{.*}} = load %arg0[] : memref<f32>
+// CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32
+// CHECK: store {{.*}}, %arg2[] : memref<f32>
+```
+
+Similar APIs are provided to emit the lower-level `loop.for` op with
+`LoopNestBuilder`. See the `builder-api-test.cpp` test for more usage examples.
+
+Since the implementation of declarative builders is in C++, it is also available
+to program the IR with an embedded-DSL flavor directly integrated in MLIR. We
+make use of these properties in the tutorial.
+ +Spoiler: MLIR also provides Python bindings for these builders, and a +full-fledged Python machine learning DSL with automatic differentiation +targeting MLIR was built as an early research collaboration. + diff --git a/mlir/docs/GenericDAGRewriter.md b/mlir/docs/GenericDAGRewriter.md new file mode 100644 index 0000000000000000000000000000000000000000..8cc09f7d17ffdf5a1d186d0ade830421f2daad46 --- /dev/null +++ b/mlir/docs/GenericDAGRewriter.md @@ -0,0 +1,415 @@ +# MLIR Generic DAG Rewriter Infrastructure + +## Introduction and Motivation + +The goal of a compiler IR is to represent code - at various levels of +abstraction which pose different sets of tradeoffs in terms of representational +capabilities and ease of transformation. However, the ability to represent code +is not itself very useful - you also need to be able to implement those +transformations. + +There are many different sorts of compiler transformations, but this document +focuses on a particularly important class of transformation that comes up +repeatedly at scale, and is important for the immediate goals of MLIR: that of +pattern matching on a set of operations and replacing with another set. This is +the key algorithm required to implement the "op fission" algorithm used by the +tf2xla bridge, pattern matching rewrites from TF ops to TF/Lite, peephole +optimizations like "eliminate identity nodes" or "replace x+0 with x", as well +as a useful abstraction to implement optimization algorithms for MLIR graphs at +all levels. 
+
+A particular strength of MLIR (and a major difference vs other compiler
+infrastructures like LLVM, GCC, XLA, TensorFlow, etc) is that it uses a single
+compiler IR to represent code at multiple levels of abstraction: an MLIR
+operation can be a "TensorFlow operation", an "XLA HLO", a "TF Lite
+FlatBufferModel op", a TPU LLO instruction, an LLVM IR instruction (transitively
+including X86, Lanai, CUDA, and other target specific instructions), or anything
+else that the MLIR type system can reasonably express. Because MLIR spans such a
+wide range of different problems, a single infrastructure for performing
+graph-to-graph rewrites can help solve many diverse domain challenges, ranging
+from the TensorFlow graph level down to the machine code level.
+
+[Static single assignment](https://en.wikipedia.org/wiki/Static_single_assignment_form)
+(SSA) representations like MLIR make it easy to access the operands and "users"
+of an operation. As such, a natural abstraction for these graph-to-graph
+rewrites is that of DAG pattern matching: clients define DAG tile patterns, and
+each pattern includes a result DAG to produce and the cost of the result (or,
+inversely, the benefit of doing the replacement). A common infrastructure
+efficiently finds and performs the rewrites.
+
+While this concept is simple, the details are more nuanced. This proposal
+defines and explores a set of abstractions that we feel can solve a wide range
+of different problems, and can be applied to many different sorts of problems
+that MLIR is - and is expected to - face over time. We do this by separating the
+pattern definition and matching algorithm from the "driver" of the computation
+loop, and make space for the patterns to be defined declaratively in the future.
+
+## Related Work
+
+There is a huge amount of related work to consider, given that pretty much every
+compiler in existence has to solve this problem many times over.
Here are a few
+graph rewrite systems we have used, along with the pros and cons of this related
+work. One unifying problem with all of these is that these systems are only
+trying to solve one particular and usually narrow problem: our proposal would
+like to solve many of these problems with a single infrastructure. Of these, the
+most similar design to our proposal is the LLVM DAG-to-DAG instruction selection
+algorithm at the end.
+
+### Constant folding
+
+A degenerate but pervasive case of DAG-to-DAG pattern matching is constant
+folding: given an operation whose operands contain constants, it can often be
+folded to a result constant value.
+
+MLIR already has constant folding routines which provide a simpler API than a
+general DAG-to-DAG pattern matcher, and we expect it to remain because the
+simpler contract makes it applicable in some cases that a generic matcher would
+not. For example, a DAG-rewrite can remove arbitrary nodes in the current
+function, which could invalidate iterators. Constant folding as an API does not
+remove any nodes, it just provides a (list of) constant values and allows the
+clients to update their data structures as necessary.
+
+### AST-Level Pattern Matchers
+
+The literature is full of source-to-source translators which transform
+identities in order to improve performance (e.g. transforming `X*0` into `0`).
+One large example that I'm aware of is the GCC `fold` function, which performs
+[many optimizations](https://github.com/gcc-mirror/gcc/blob/master/gcc/fold-const.c)
+on ASTs. Clang has
+[similar routines](http://releases.llvm.org/3.5.0/tools/clang/docs/InternalsManual.html#constant-folding-in-the-clang-ast)
+for simple constant folding of expressions (as required by the C++ standard) but
+doesn't perform general optimizations on its ASTs.
+
+The primary downside of tree optimizers is that you can't see across operations
+that have multiple uses.
It is +[well known in literature](https://llvm.org/pubs/2008-06-LCTES-ISelUsingSSAGraphs.pdf) +that DAG pattern matching is more powerful than tree pattern matching, but OTOH, +DAG pattern matching can lead to duplication of computation which needs to be +checked for. + +### "Combiners" and other peephole optimizers + +Compilers end up with a lot of peephole optimizers for various things, e.g. the +GCC +["combine" routines](https://github.com/gcc-mirror/gcc/blob/master/gcc/combine.c) +(which try to merge two machine instructions into a single one), the LLVM +[Inst Combine](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/) +[pass](https://llvm.org/docs/Passes.html#instcombine-combine-redundant-instructions), +LLVM's +[DAG Combiner](https://github.com/llvm-mirror/llvm/blob/master/lib/CodeGen/SelectionDAG/DAGCombiner.cpp), +the Swift compiler's +[SIL Combiner](https://github.com/apple/swift/tree/master/lib/SILOptimizer/SILCombiner), +etc. These generally match one or more operations and produce zero or more +operations as a result. The LLVM +[Legalization](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/) +infrastructure has a different outer loop but otherwise works the same way. + +These passes have a lot of diversity, but also have a unifying structure: they +mostly have a worklist outer loop which visits operations. They then use the C++ +visitor pattern (or equivalent) to switch over the class of operation and +dispatch to a method. That method contains a long list of hand-written C++ code +that pattern-matches various special cases. LLVM introduced a "match" function +that allows writing patterns in a somewhat more declarative style using template +metaprogramming (MLIR has similar facilities). 
Here's a simple example:
+
+```c++
+  // Y - (X + 1) --> ~X + Y
+  if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One()))))
+    return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0);
+```
+
+Here is a somewhat more complicated one (this is not the biggest or most
+complicated :)
+
+```c++
+  // C2 is ODD
+  // LHS = XOR(Y,C1), Y = AND(Z,C2), C1==(C2+1) => LHS == NEG(OR(Z, ~C2))
+  // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
+  if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
+    if (C1->countTrailingZeros() == 0)
+      if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
+        Value *NewOr = Builder.CreateOr(Z, ~(*C2));
+        return Builder.CreateSub(RHS, NewOr, "sub");
+      }
+```
+
+These systems are simple to set up, and pattern matching templates have some
+advantages (they are extensible for new sorts of sub-patterns, look compact at
+point of use). OTOH, they have lots of well known problems, for example:
+
+*   These patterns are very error prone to write, and contain lots of
+    redundancies.
+*   The IR being matched often has identities (e.g. when matching commutative
+    operators) and the C++ code has to handle it manually - take a look at
+    [the full code](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp?view=markup#l775)
+    for `checkForNegativeOperand`, which defines the second pattern.
+*   The matching code compiles slowly, both because it generates tons of code
+    and because the templates instantiate slowly.
+*   Adding new patterns (e.g. for count leading zeros in the example above) is
+    awkward and doesn't often happen.
+*   The cost model for these patterns is not really defined - it is emergent
+    based on the order the patterns are matched in code.
+*   They are non-extensible without rebuilding the compiler.
+*   It isn't practical to apply theorem provers and other tools to these
+    patterns - they cannot be reused for other purposes.
+ +In addition to structured "combiners" like these, there are lots of ad-hoc +systems like the +[LLVM Machine code peephole optimizer](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?view=markup) +which are related. + +### LLVM's DAG-to-DAG Instruction Selection Infrastructure + +The instruction selection subsystem in LLVM is the result of many years worth of +iteration and discovery, driven by the need for LLVM to support code generation +for lots of targets, the complexity of code generators for modern instruction +sets (e.g. X86), and the fanatical pursuit of reusing code across targets. Eli +wrote a +[nice short overview](https://eli.thegreenplace.net/2013/02/25/a-deeper-look-into-the-llvm-code-generator-part-1) +of how this works, and the +[LLVM documentation](https://llvm.org/docs/CodeGenerator.html#select-instructions-from-dag) +describes it in more depth including its advantages and limitations. It allows +writing patterns like this. + +``` +def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; +``` + +This example defines a matcher for the +["blci" instruction](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_\(Trailing_Bit_Manipulation\)) +in the +[X86 target description](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.td?view=markup), +there are many others in that file (look for `Pat<>` patterns, since they aren't +entangled in details of the compiler like assembler/disassembler generation +logic). + +For our purposes, there is much to like about this system, for example: + +* It is defined in a declarative format. +* It is extensible to target-defined operations. +* It automates matching across identities, like commutative patterns. +* It allows custom abstractions and intense factoring of target-specific + commonalities. +* It generates compact code - it compiles into a state machine, which is + interpreted. 
+* It allows the instruction patterns to be defined and reused for multiple + purposes. +* The patterns are "type checked" at compile time, detecting lots of bugs + early and eliminating redundancy from the pattern specifications. +* It allows the use of general C++ code for weird/complex cases. + +While there is a lot that is good here, there is also a lot of bad things: + +* All of this machinery is only applicable to instruction selection. Even + directly adjacent problems like the DAGCombiner and Legalizer can't use it. +* This isn't extensible at compiler runtime, you have to rebuild the compiler + to extend it. +* The error messages when failing to match a pattern + [are not exactly optimal](https://www.google.com/search?q=llvm+cannot+select). +* It has lots of implementation problems and limitations (e.g. can't write a + pattern for a multi-result operation) as a result of working with the + awkward SelectionDAG representation and being designed and implemented + lazily. +* This stuff all grew organically over time and has lots of sharp edges. + +### Summary + +MLIR will face a wide range of pattern matching and graph rewrite problems, and +one of the major advantages of having a common representation for code at +multiple levels that it allows us to invest in - and highly leverage - a single +infra for doing this sort of work. + +## Goals + +This proposal includes support for defining pattern matching and rewrite +algorithms on MLIR. We'd like these algorithms to encompass many problems in the +MLIR space, including 1-to-N expansions (e.g. as seen in the TF/XLA bridge when +lowering a "tf.AddN" to multiple "add" HLOs), M-to-1 patterns (as seen in +Grappler optimization passes, e.g. that convert multiple/add into a single +muladd op), as well as general M-to-N patterns (e.g. instruction selection for +target instructions). 
Patterns should have a cost associated with them, and the +common infrastructure should be responsible for sorting out the lowest cost +match for a given application. + +We separate the task of picking a particular locally optimal pattern from a +given root node, the algorithm used to rewrite an entire graph given a +particular set of goals, and the definition of the patterns themselves. We do +this because DAG tile pattern matching is NP complete, which means that there +are no known polynomial time algorithms to optimally solve this problem. +Additionally, we would like to support iterative rewrite algorithms that +progressively transform the input program through multiple steps. Furthermore, +we would like to support many different sorts of clients across the MLIR stack, +and they may have different tolerances for compile time cost, different demands +for optimality, and other algorithmic goals or constraints. + +We aim for MLIR transformations to be easy to implement and reduce the +likelihood for compiler bugs. We expect there to be a very very large number of +patterns that are defined over time, and we believe that these sorts of patterns +will have a very large number of legality/validity constraints - many of which +are difficult to reason about in a consistent way, may be target specific, and +whose implementation may be particularly bug-prone. As such, we aim to design the +API around pattern definition to be simple, resilient to programmer errors, and +allow separation of concerns between the legality of the nodes generated from +the idea of the pattern being defined. + +Finally, error handling is a topmost concern: in addition to allowing patterns +to be defined in a target-independent way that may not apply for all hardware, +we also want failure for any pattern to match to be diagnosable in a reasonable +way. 
To be clear, this is not a solvable problem in general - the space of +malfunction is too great to be fully enumerated and handled optimally, but there +are better and worse ways to handle the situation. MLIR is already designed to +represent the provenance of an operation well. This project aims to propagate +that provenance information precisely, as well as diagnose pattern match +failures with the rationale for why a set of patterns do not apply. + +### Non goals + +This proposal doesn't aim to solve all compiler problems, it is simply a +DAG-to-DAG pattern matching system, starting with a greedy driver algorithm. +Compiler algorithms that require global dataflow analysis (e.g. common +subexpression elimination, conditional constant propagation, and many many +others) will not be directly solved by this infrastructure. + +This proposal is limited to DAG patterns, which (by definition) prevent the +patterns from seeing across cycles in a graph. In an SSA-based IR like MLIR, +this means that these patterns don't see across PHI nodes / basic block +arguments. We consider this acceptable given the set of problems we are trying +to solve - we don't know of any other system that attempts to do so, and +consider the payoff of worrying about this to be low. + +This design includes the ability for DAG patterns to have associated costs +(benefits), but those costs are defined in terms of magic numbers (typically +equal to the number of nodes being replaced). For any given application, the +units of magic numbers will have to be defined. + +## Overall design + +We decompose the problem into four major pieces: + +1. the code that is used to define patterns to match, cost, and their + replacement actions +1. the driver logic to pick the best match for a given root node +1. the client that is implementing some transformation (e.g. a combiner) +1. (future) the subsystem that allows patterns to be described with a + declarative syntax, which sugars step #1. 
+ +We sketch the first three of these pieces, each in turn. This is not intended to +be a concrete API proposal, merely to describe the design. + +### Defining Patterns + +Each pattern will be an instance of a mlir::Pattern class, whose subclasses +implement methods like this. Note that this API is meant for exposition, the +actual details are different for efficiency and coding standards reasons (e.g. +the memory management of `PatternState` is not specified below, etc): + +```c++ +class Pattern { + /// Return the benefit (the inverse of "cost") of matching this pattern. The + /// benefit of a Pattern is always static - rewrites that may have dynamic + /// benefit can be instantiated multiple times (different Pattern instances) + /// for each benefit that they may return, and be guarded by different match + /// condition predicates. + PatternBenefit getBenefit() const { return benefit; } + + /// Return the root node that this pattern matches. Patterns that can + /// match multiple root types are instantiated once per root. + OperationName getRootKind() const { return rootKind; } + + /// Attempt to match against code rooted at the specified operation, + /// which is the same operation code as getRootKind(). On failure, this + /// returns a None value. On success it returns a (possibly null) pattern-specific + /// state wrapped in a Some. This state is passed back into its rewrite + /// function if this match is selected. + virtual Optional<PatternState *> match(Operation *op) const = 0; + + /// Rewrite the IR rooted at the specified operation with the result of + /// this pattern, generating any new operations with the specified + /// rewriter. If an unexpected error is encountered (an internal + /// compiler error), it is emitted through the normal MLIR diagnostic + /// hooks and the IR is left in a valid state. 
+ virtual void rewrite(Operation *op, PatternState *state, + PatternRewriter &rewriter) const; +}; +``` + +In practice, the first patterns we implement will directly subclass and +implement this stuff, but we will define some helpers to reduce boilerplate. +When we have a declarative way to describe patterns, this should be +automatically generated from the description. + +Instances of `Pattern` have a benefit that is static upon construction of the +pattern instance, but may be computed dynamically at pattern initialization +time, e.g. allowing the benefit to be derived from domain specific information, +like the target architecture. This limitation allows MLIR to (eventually) +perform pattern fusion and compile patterns into an efficient state machine, and +[Thier, Ertl, and Krall](https://dl.acm.org/citation.cfm?id=3179501) have shown +that match predicates eliminate the need for dynamically computed costs in +almost all cases: you can simply instantiate the same pattern one time for each +possible cost and use the predicate to guard the match. + +The two-phase nature of this API (match separate from rewrite) is important for +two reasons: 1) some clients may want to explore different ways to tile the +graph, and only rewrite after committing to one tiling. 2) We want to support +runtime extensibility of the pattern sets, but want to be able to statically +compile the bulk of known patterns into a state machine at "compiler compile +time". Both of these reasons lead to us needing to match multiple patterns +before committing to an answer. + +### Picking and performing a replacement + +In the short term, this API can be very simple, something like this can work and +will be useful for many clients: + +```c++ +class PatternMatcher { + // Create a pattern matcher with a bunch of patterns. This constructor + // looks across all of the specified patterns, and builds an internal + // data structure that allows efficient matching. 
+ PatternMatcher(ArrayRef<Pattern *> patterns); + + // Given a specific operation, see if there is some rewrite that is + // interesting. If so, return success and return the list of new + // operations that were created. If not, return failure. + bool matchAndRewrite(Operation *op, + SmallVectorImpl<Operation *> &newlyCreatedOps); +}; +``` + +In practice the interesting part of this class is the acceleration structure it +builds internally. It buckets up the patterns by root operation, and sorts them +by their static benefit. When performing a match, it tests any dynamic patterns, +then tests statically known patterns from highest to lowest benefit. + +### First Client: A Greedy Worklist Combiner + +We expect that there will be lots of clients for this, but a simple greedy +worklist-driven combiner should be powerful enough to serve many important ones, +including the +[TF2XLA op expansion logic](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/tf2xla/kernels), +many of the pattern substitution passes of the +[TOCO compiler](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco) +for TF-Lite, many +[Grappler](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/grappler) +passes, and other general performance optimizations for applying identities. + +The structure of this algorithm is straight-forward, here is pseudo code: + +* Walk a function in preorder, adding each operation to a worklist. +* While the worklist is non-empty, pull something off the back (processing + things generally in postorder) + * Perform matchAndRewrite on the operation. If failed, continue to the + next operation. + * On success, add the newly created ops to the worklist and continue. + +## Future directions + +It is important to get implementation and usage experience with this, and many +patterns can be defined using this sort of framework. Over time, we can look to +make it easier to declare patterns in a declarative form (e.g. 
with the LLVM +tblgen tool or something newer/better). Once we have that, we can define an +internal abstraction for describing the patterns to match, allowing better high +level optimization of patterns (including fusion of the matching logic across +patterns, which the LLVM instruction selector does) and allow the patterns to be +defined without rebuilding the compiler itself. diff --git a/mlir/docs/Glossary.md b/mlir/docs/Glossary.md new file mode 100644 index 0000000000000000000000000000000000000000..542d3756ac70a0830724d55cd531099c1e1b84fb --- /dev/null +++ b/mlir/docs/Glossary.md @@ -0,0 +1,174 @@ +# MLIR Glossary + +This glossary contains definitions of MLIR-specific terminology. It is intended +to be a quick reference document. For terms which are well-documented elsewhere, +definitions are kept brief and the header links to the more in-depth +documentation. + + + +#### [Block](LangRef.md#blocks) + +A sequential list of operations without control flow. + +Also called a [basic block](https://en.wikipedia.org/wiki/Basic_block). + +#### Conversion + +The transformation of code represented in one dialect into a semantically +equivalent representation in another dialect (i.e. inter-dialect conversion) or +the same dialect (i.e. intra-dialect conversion). + +In the context of MLIR, conversion is distinct from [translation](#translation). +Conversion refers to a transformation between (or within) dialects, but all +still within MLIR, whereas translation refers to a transformation between MLIR +and an external representation. + +#### [Declarative Rewrite Rule](DeclarativeRewrites.md) (DRR) + +A [rewrite rule](https://en.wikipedia.org/wiki/Graph_rewriting) which can be +defined declaratively (e.g. through specification in a +[TableGen](https://llvm.org/docs/TableGen/) record). At compiler build time, +these rules are expanded into an equivalent `mlir::RewritePattern` subclass. 
+ +#### [Dialect](LangRef.md#dialects) + +A dialect is a grouping of functionality which can be used to extend the MLIR +system. + +A dialect creates a unique `namespace` within which new +[operations](#operation-op), [attributes](LangRef.md#attributes), and +[types](LangRef.md#type-system) are defined. This is the fundamental method by +which to extend MLIR. + +In this way, MLIR is a meta-IR: its extensible framework allows it to be +leveraged in many different ways (e.g. at different levels of the compilation +process). Dialects provide an abstraction for the different uses of MLIR while +recognizing that they are all a part of the meta-IR that is MLIR. + +The tutorial provides an example of +[interfacing with MLIR](Tutorials/Toy/Ch-2.md#interfacing-with-mlir) in this +way. + +(Note that we have intentionally selected the term "dialect" instead of +"language", as the latter would wrongly suggest that these different namespaces +define entirely distinct IRs.) + +#### Export + +To transform code represented in MLIR into a semantically equivalent +representation which is external to MLIR. + +The tool that performs such a transformation is called an exporter. + +See also: [translation](#translation). + +#### [Function](LangRef.md#functions) + +An [operation](#operation-op) with a name containing one [region](#region). + +The region of a function is not allowed to implicitly capture values defined +outside of the function, and all external references must use function arguments +or attributes that establish a symbolic connection. + +#### Import + +To transform code represented in an external representation into a semantically +equivalent representation in MLIR. + +The tool that performs such a transformation is called an importer. + +See also: [translation](#translation). 
+ +#### Legalization + +The process of transforming operations into a semantically equivalent +representation which adheres to the requirements set by the +[conversion target](DialectConversion.md#conversion-target). + +That is, legalization is accomplished if and only if the new representation +contains only operations which are legal, as specified in the conversion target. + +#### Lowering + +The process of transforming a higher-level representation of an operation into a +lower-level, but semantically equivalent, representation. + +In MLIR, this is typically accomplished through +[dialect conversion](DialectConversion.md). This provides a framework by which +to define the requirements of the lower-level representation, called the +[conversion target](DialectConversion.md#conversion-target), by specifying which +operations are legal versus illegal after lowering. + +See also: [legalization](#legalization). + +#### [Module](LangRef.md#module) + +An [operation](#operation-op) which contains a single region containing a single +block that is comprised of operations. + +This provides an organizational structure for MLIR operations, and is the +expected top-level operation in the IR: the textual parser returns a Module. + +#### [Operation](LangRef.md#operations) (op) + +A unit of code in MLIR. Operations are the building blocks for all code and +computations represented by MLIR. They are fully extensible (there is no fixed +list of operations) and have application-specific semantics. + +An operation can have zero or more [regions](#region). Note that this creates a +nested IR structure, as regions consist of blocks, which in turn, consist of a +list of operations. + +In MLIR, there are two main classes related to operations: `Operation` and `Op`. +Operation is the actual opaque instance of the operation, and represents the +general API into an operation instance. 
An `Op` is the base class of a derived +operation, like `ConstantOp`, and acts as a smart pointer wrapper around an +`Operation*`. + +#### [Region](LangRef.md#regions) + +A [CFG](https://en.wikipedia.org/wiki/Control-flow_graph) of MLIR +[blocks](#block). + +#### Round-trip + +The process of converting from a source format to a target format and then back +to the source format. + +This is a good way of gaining confidence that the target format richly models +the source format. This is particularly relevant in the MLIR context, since +MLIR's multi-level nature allows for easily writing target dialects that model a +source format (such as TensorFlow GraphDef or another non-MLIR format) +faithfully and have a simple conversion procedure. Further cleanup/lowering can +be done entirely within the MLIR representation. This separation - making the +[importer](#import) as simple as possible and performing all further +cleanups/lowering in MLIR - has proven to be a useful design pattern. + +#### [Terminator operation](LangRef.md#terminator-operations) + +An [operation](#operation-op) which *must* terminate a [block](#block). +Terminator operations are a special category of operations. + +#### Transitive lowering + +An A->B->C [lowering](#lowering); that is, a lowering in which multiple patterns +may be applied in order to fully transform an illegal operation into a set of +legal ones. + +This provides the flexibility that the [conversion](#conversion) framework may +perform the lowering in multiple stages of applying patterns (which may utilize +intermediate patterns not in the conversion target) in order to fully legalize +an operation. This is accomplished through +[partial conversion](DialectConversion.md#modes-of-conversion). + +#### Translation + +The transformation of code represented in an external (non-MLIR) representation +into a semantically equivalent representation in MLIR (i.e. +[importing](#import)), or the inverse (i.e. [exporting](#export)). 
+ +In the context of MLIR, translation is distinct from [conversion](#conversion). +Translation refers to a transformation between MLIR and an external +representation, whereas conversion refers to a transformation within MLIR +(between or within dialects). diff --git a/mlir/docs/Interfaces.md b/mlir/docs/Interfaces.md new file mode 100644 index 0000000000000000000000000000000000000000..f413cac28bb00227db8158f825b691ca95ebcd9d --- /dev/null +++ b/mlir/docs/Interfaces.md @@ -0,0 +1,200 @@ +# Introduction to MLIR Interfaces + +MLIR is generic and very extensible; it allows for opaquely representing many +different dialects that have their own operations, attributes, types, and so on. +This allows for dialects to be very expressive in their semantics and for MLIR +to capture many different levels of abstraction. The downside to this is that +transformations and analyses must be extremely conservative about the operations +that they encounter, and must special-case the different dialects that they +support. To combat this, MLIR provides the concept of `interfaces`. + +## Motivation + +Interfaces provide a generic way of interacting with the IR. The goal is to be +able to express transformations/analyses in terms of these interfaces without +encoding specific knowledge about the exact operation or dialect involved. This +makes the compiler more extensible by allowing the addition of new dialects and +operations in a decoupled way with respect to the implementation of +transformations/analyses. + +### Dialect Interfaces + +Dialect interfaces are generally useful for transformation passes or analyses +that want to opaquely operate on operations, even *across* dialects. These +interfaces generally involve wide coverage over the entire dialect and are only +used for a handful of transformations/analyses. In these cases, registering the +interface directly on each operation is overly complex and cumbersome. 
The +interface is not core to the operation, just to the specific transformation. An +example of where this type of interface would be used is inlining. Inlining +generally queries high-level information about the operations within a dialect, +like legality and cost modeling, that often is not specific to one operation. + +A dialect interface can be defined by inheriting from the CRTP base class +`DialectInterfaceBase::Base`. This class provides the necessary utilities for +registering an interface with the dialect so that it can be looked up later. +Once the interface has been defined, dialects can override it using +dialect-specific information. The interfaces defined by a dialect are registered +in a similar mechanism to Attributes, Operations, Types, etc. + +```c++ +/// Define an Inlining interface to allow for dialects to opt-in. +class DialectInlinerInterface : + public DialectInterface::Base { +public: + /// Returns true if the given region 'src' can be inlined into the region + /// 'dest' that is attached to an operation registered to the current dialect. + /// 'valueMapping' contains any remapped values from within the 'src' region. + /// This can be used to examine what values will replace entry arguments into + /// the 'src' region, for example. + virtual bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &valueMapping) const { + return false; + } +}; + +/// Override the inliner interface to add support for inlining affine +/// operations. +struct AffineInlinerInterface : public DialectInlinerInterface { + /// Affine structures have specific inlining constraints. + bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &valueMapping) const final { + ... + } +}; + +/// Register the interface with the dialect. +AffineOpsDialect::AffineOpsDialect(MLIRContext *context) ... 
{ + addInterfaces(); +} +``` + +Once registered, these interfaces can be opaquely queried from the dialect by +the transformation/analysis that wants to use them: + +```c++ +Dialect *dialect = ...; +if (auto *interface = dialect->getInterface()) + ... // The dialect provides this interface. +``` + +#### DialectInterfaceCollections + +An additional utility is provided via DialectInterfaceCollection. This CRTP +class allows for collecting all of the dialects that have registered a given +interface within the context. + +```c++ +class InlinerInterface : public + DialectInterfaceCollection { + /// The hooks for this class mirror the hooks for the DialectInlinerInterface, + /// with default implementations that call the hook on the interface for a + /// given dialect. + virtual bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &valueMapping) const { + auto *handler = getInterfaceFor(dest->getContainingOp()); + return handler ? handler->isLegalToInline(dest, src, valueMapping) : false; + } +}; + +MLIRContext *ctx = ...; +InlinerInterface interface(ctx); +if(!interface.isLegalToInline(...)) + ... +``` + +### Operation Interfaces + +Operation interfaces, as the name suggests, are those registered at the +Operation level. These interfaces provide an opaque view into derived operations +by providing a virtual interface that must be implemented. As an example, the +`Linalg` dialect may implement an interface that provides general queries about +some of the dialects library operations. These queries may provide things like: +the number of parallel loops; the number of inputs and outputs; etc. + +Operation interfaces are defined by overriding the CRTP base class +`OpInterface`. This class takes, as a template parameter, a `Traits` class that +defines a `Concept` and a `Model` class. 
These classes provide an implementation +of concept-based polymorphism, where the Concept defines a set of virtual +methods that are overridden by the Model that is templated on the concrete +operation type. It is important to note that these classes should be pure in +that they contain no non-static data members. Operations that wish to override +this interface should add the provided trait `OpInterface<..>::Trait` upon +registration. + +```c++ +struct ExampleOpInterfaceTraits { + /// Define a base concept class that defines the virtual interface that needs + /// to be overridden. + struct Concept { + virtual ~Concept(); + virtual unsigned getNumInputs(Operation *op) = 0; + }; + + /// Define a model class that specializes a concept on a given operation type. + template <typename ConcreteOp> + struct Model : public Concept { + /// Override the method to dispatch on the concrete operation. + unsigned getNumInputs(Operation *op) final { + return llvm::cast<ConcreteOp>(op).getNumInputs(); + } + }; +}; + +class ExampleOpInterface : public OpInterface<ExampleOpInterface, ExampleOpInterfaceTraits> { +public: + /// Use base class constructor to support LLVM-style casts. + using OpInterface::OpInterface; + + /// The interface dispatches to 'getImpl()', an instance of the concept. + unsigned getNumInputs() { + return getImpl()->getNumInputs(getOperation()); + } +}; + +``` + +Once the interface has been defined, it is registered to an operation by adding +the provided trait `ExampleOpInterface::Trait`. Using this interface is just +like using any other derived operation type, i.e. casting: + +```c++ +/// When defining the operation, the interface is registered via the nested +/// 'Trait' class provided by the 'OpInterface<>' base class. +class MyOp : public Op<MyOp, ExampleOpInterface::Trait> { +public: + /// The definition of the interface method on the derived operation. + unsigned getNumInputs() { return ...; } +}; + +/// Later, we can query if a specific operation (like 'MyOp') overrides the given +/// interface. 
+Operation *op = ...; +if (ExampleOpInterface example = dyn_cast<ExampleOpInterface>(op)) + llvm::errs() << "num inputs = " << example.getNumInputs() << "\n"; +``` + +#### Utilizing the ODS Framework + +Operation interfaces require a bit of boilerplate to connect all of the pieces +together. The ODS (Operation Definition Specification) framework provides +simplified mechanisms for +[defining interfaces](OpDefinitions.md#operation-interfaces). + +As an example, using the ODS framework would allow for defining the example +interface above as: + +```tablegen +def ExampleOpInterface : OpInterface<"ExampleOpInterface"> { + let description = [{ + This is an example interface definition. + }]; + + let methods = [ + InterfaceMethod< + "Get the number of inputs for the current operation.", + "unsigned", "getNumInputs" + >, + ]; +} +``` diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md new file mode 100644 index 0000000000000000000000000000000000000000..da60b8b892e985d89a1670c6ef51fbe1f70a5252 --- /dev/null +++ b/mlir/docs/LangRef.md @@ -0,0 +1,1497 @@ +# MLIR Specification + +MLIR (Multi-Level IR) is a compiler intermediate representation with +similarities to traditional three-address SSA representations (like +[LLVM IR](http://llvm.org/docs/LangRef.html) or +[SIL](https://github.com/apple/swift/blob/master/docs/SIL.rst)), but which +introduces notions from polyhedral loop optimization as first-class concepts. +This hybrid design is optimized to represent, analyze, and transform high level +dataflow graphs as well as target-specific code generated for high performance +data parallel systems. Beyond its representational capabilities, its single +continuous design provides a framework to lower from dataflow graphs to +high-performance target-specific code. + +This document defines and describes the key concepts in MLIR, and is intended to +be a dry reference document - the [rationale documentation](Rationale.md), +[glossary](Glossary.md), and other content are hosted elsewhere. 
+ +MLIR is designed to be used in three different forms: a human-readable textual +form suitable for debugging, an in-memory form suitable for programmatic +transformations and analysis, and a compact serialized form suitable for storage +and transport. The different forms all describe the same semantic content. This +document describes the human-readable textual form. + +[TOC] + +## High-Level Structure + +MLIR is an +[SSA-based](https://en.wikipedia.org/wiki/Static_single_assignment_form) IR, +which means that values are defined before use and have scope defined by their +dominance relations. Operations may produce zero or more results, and each is a +distinct SSA value with its own type defined by the [type system](#type-system). + +The unit of code in MLIR is an [Operation](#operations). Operations allow for +representing many different concepts: allocating buffers, producing views to +transform them, target-independent arithmetic, target-specific operations, and +even arbitrary user-defined high-level operations including the +[Module](#module) and [Function](#functions) operations. Operations may contain +[Regions](#regions) that represent a Control Flow Graph (CFG) of +[Blocks](#blocks), that contain operations and end with a +[terminator operation](#terminator-operations) (like branches). + +Here's an example of an MLIR module: + +```mlir +// Compute A*B using an implementation of multiply kernel and print the +// result using a TensorFlow op. The dimensions of A and B are partially +// known. The shapes are assumed to match. +func @mul(%A: tensor<100x?xf32>, %B: tensor<?x50xf32>) -> (tensor<100x50xf32>) { + // Compute the inner dimension of %A using the dim operation. + %n = dim %A, 1 : tensor<100x?xf32> + + // Allocate addressable "buffers" and copy tensors %A and %B into them. 
+ %A_m = alloc(%n) : memref<100x?xf32> + tensor_store %A to %A_m : memref<100x?xf32> + + %B_m = alloc(%n) : memref<?x50xf32> + tensor_store %B to %B_m : memref<?x50xf32> + + // Call function @multiply passing memrefs as arguments, + // and getting returned the result of the multiplication. + %C_m = call @multiply(%A_m, %B_m) + : (memref<100x?xf32>, memref<?x50xf32>) -> (memref<100x50xf32>) + + dealloc %A_m : memref<100x?xf32> + dealloc %B_m : memref<?x50xf32> + + // Load the buffer data into a higher level "tensor" value. + %C = tensor_load %C_m : memref<100x50xf32> + dealloc %C_m : memref<100x50xf32> + + // Call TensorFlow built-in function to print the result tensor. + "tf.Print"(%C){message: "mul result"} + : (tensor<100x50xf32>) -> (tensor<100x50xf32>) + + return %C : tensor<100x50xf32> +} + +// A function that multiplies two memrefs and returns the result. +func @multiply(%A: memref<100x?xf32>, %B: memref<?x50xf32>) + -> (memref<100x50xf32>) { + // Compute the inner dimension of %A. + %n = dim %A, 1 : memref<100x?xf32> + + // Allocate memory for the multiplication result. + %C = alloc() : memref<100x50xf32> + + // Multiplication loop nest. + affine.for %i = 0 to 100 { + affine.for %j = 0 to 50 { + store 0 to %C[%i, %j] : memref<100x50xf32> + affine.for %k = 0 to %n { + %a_v = load %A[%i, %k] : memref<100x?xf32> + %b_v = load %B[%k, %j] : memref<?x50xf32> + %prod = mulf %a_v, %b_v : f32 + %c_v = load %C[%i, %j] : memref<100x50xf32> + %sum = addf %c_v, %prod : f32 + store %sum, %C[%i, %j] : memref<100x50xf32> + } + } + } + return %C : memref<100x50xf32> +} +``` + +## Notation + +MLIR has a simple and unambiguous grammar, allowing it to reliably round-trip +through a textual form. This is important for development of the compiler - e.g. +for understanding the state of code as it is being transformed and writing test +cases. + +This document describes the grammar using +[Extended Backus-Naur Form (EBNF)](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form). 
+ +This is the EBNF grammar used in this document, presented in yellow boxes. + +``` +alternation ::= expr0 | expr1 | expr2 // Either expr0 or expr1 or expr2. +sequence ::= expr0 expr1 expr2 // Sequence of expr0 expr1 expr2. +repetition0 ::= expr* // 0 or more occurrences. +repetition1 ::= expr+ // 1 or more occurrences. +optionality ::= expr? // 0 or 1 occurrence. +grouping ::= (expr) // Everything inside parens is grouped together. +literal ::= `abcd` // Matches the literal `abcd`. +``` + +Code examples are presented in blue boxes. + +```mlir +// This is an example use of the grammar above: +// This matches things like: ba, bana, boma, banana, banoma, bomana... +example ::= `b` (`an` | `om`)* `a` +``` + +### Common syntax + +The following core grammar productions are used in this document: + +``` +// TODO: Clarify the split between lexing (tokens) and parsing (grammar). +digit ::= [0-9] +hex_digit ::= [0-9a-fA-F] +letter ::= [a-zA-Z] +id-punct ::= [$._-] + +integer-literal ::= decimal-literal | hexadecimal-literal +decimal-literal ::= digit+ +hexadecimal-literal ::= `0x` hex_digit+ +float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)? +string-literal ::= `"` [^"\n\f\v\r]* `"` TODO define escaping rules +``` + +Not listed here, but MLIR does support comments. They use standard BCPL syntax, +starting with a `//` and going until the end of the line. + +### Identifiers and keywords + +Syntax: + +``` +// Identifiers +bare-id ::= (letter|[_]) (letter|digit|[_$.])* +bare-id-list ::= bare-id (`,` bare-id)* +ssa-id ::= `%` suffix-id +suffix-id ::= (digit+ | ((letter|id-punct) (letter|id-punct|digit)*)) + +symbol-ref-id ::= `@` (suffix-id | string-literal) +ssa-id-list ::= ssa-id (`,` ssa-id)* + +// Uses of an SSA value, e.g. in an operand list to an operation. +ssa-use ::= ssa-id +ssa-use-list ::= ssa-use (`,` ssa-use)* +``` + +Identifiers name entities such as SSA values, types and functions, and are +chosen by the writer of MLIR code. 
Identifiers may be descriptive (e.g. +`%batch_size`, `@matmul`), or may be non-descriptive when they are +auto-generated (e.g. `%23`, `@func42`). Identifier names for SSA values may be +used in an MLIR text file but are not persisted as part of the IR - the printer +will give them anonymous names like `%42`. + +MLIR guarantees identifiers never collide with keywords by prefixing identifiers +with a sigil (e.g. `%`, `#`, `@`, `^`, `!`). In certain unambiguous contexts +(e.g. affine expressions), identifiers are not prefixed, for brevity. New +keywords may be added to future versions of MLIR without danger of collision +with existing identifiers. + +The scope of SSA values is defined based on the standard definition of +[dominance](https://en.wikipedia.org/wiki/Dominator_\(graph_theory\)). Argument +identifiers in mapping functions are in scope for the mapping body. Function +identifiers and mapping identifiers are visible across the entire module. + +## Dialects + +Dialects are the mechanism by which to engage with and extend the MLIR +ecosystem. They allow for defining new [operations](#operations), as well as +[attributes](#attributes) and [types](#type-system). Each dialect is given a +unique `namespace` that is prefixed to each defined attribute/operation/type. +For example, the [Affine dialect](Dialects/Affine.md) defines the namespace: +`affine`. + +MLIR allows for multiple dialects, even those outside of the main tree, to +co-exist together within one module. Dialects are produced and consumed by +certain passes. MLIR provides a [framework](DialectConversion.md) to convert +between, and within, different dialects. 
+ +A few of the dialects supported by MLIR: + +* [Affine dialect](Dialects/Affine.md) +* [GPU dialect](Dialects/GPU.md) +* [LLVM dialect](Dialects/LLVM.md) +* [SPIR-V dialect](Dialects/SPIR-V.md) +* [Standard dialect](Dialects/Standard.md) +* [Vector dialect](Dialects/Vector.md) + +### Target specific operations + +Dialects provide a modular way in which targets can expose target-specific +operations directly through to MLIR. As an example, some targets go through +LLVM. LLVM has a rich set of intrinsics for certain target-independent +operations (e.g. addition with overflow check) as well as providing access to +target-specific operations for the targets it supports (e.g. vector permutation +operations). LLVM intrinsics in MLIR are represented via operations that start +with an "llvm." name. + +Example: + +```mlir +// LLVM: %x = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a, i16 %b) +%x:2 = "llvm.sadd.with.overflow.i16"(%a, %b) : (i16, i16) -> (i16, i1) +``` + +These operations only work when targeting LLVM as a backend (e.g. for CPUs and +GPUs), and are required to align with the LLVM definition of these intrinsics. + +## Operations + +Syntax: + +``` +operation ::= op-result-list? (generic-operation | custom-operation) + trailing-location? +generic-operation ::= string-literal '(' ssa-use-list? ')' attribute-dict? + `:` function-type +custom-operation ::= bare-id custom-operation-format +op-result-list ::= op-result (`,` op-result)* `=` +op-result ::= ssa-id (`:` integer-literal) +successor-list ::= successor (`,` successor)* +successor ::= caret-id (`:` bb-arg-list)? +region-list ::= region (`,` region)* +trailing-location ::= (`loc` `(` location `)`)? +``` + +MLIR introduces a uniform concept called _operations_ to enable describing many +different levels of abstractions and computations. Operations in MLIR are fully +extensible (there is no fixed list of operations) and have application-specific +semantics. 
For example, MLIR supports +[target-independent operations](Dialects/Standard.md#memory-operations), +[affine operations](Dialects/Affine.md), and +[target-specific machine operations](#target-specific-operations). + +The internal representation of an operation is simple: an operation is +identified by a unique string (e.g. `dim`, `tf.Conv2d`, `x86.repmovsb`, +`ppc.eieio`, etc), can return zero or more results, take zero or more SSA +operands, may have zero or more attributes, may have zero or more successors, +and zero or more enclosed [regions](#regions). The generic printing form +includes all these elements literally, with a function type to indicate the +types of the results and operands. + +Example: + +```mlir +// An operation that produces two results. +// The results of %result can be accessed via the `#` syntax. +%result:2 = "foo_div"() : () -> (f32, i32) + +// Pretty form that defines a unique name for each result. +%foo, %bar = "foo_div"() : () -> (f32, i32) + +// Invoke a TensorFlow function called tf.scramble with two inputs +// and an attribute "fruit". +%2 = "tf.scramble"(%result#0, %bar) {fruit: "banana"} : (f32, i32) -> f32 +``` + +In addition to the basic syntax above, dialects may register known operations. +This allows those dialects to support _custom assembly form_ for parsing and +printing operations. In the operation sets listed below, we show both forms. + +### Terminator Operations + +These are a special category of operations that *must* terminate a block, e.g. +[branches](Dialects/Standard.md#terminator-operations). These operations may +also have a list of successors ([blocks](#blocks) and their arguments). + +Example: + +```mlir +// Branch to ^bb1 or ^bb2 depending on the condition %cond. +// Pass value %v to ^bb2, but not to ^bb1. +"cond_br"(%cond)[^bb1, ^bb2(%v : index)] : (i1) -> () +``` + +### Module + +``` +module ::= `module` symbol-ref-id? (`attributes` attribute-dict)? 
region +``` + +An MLIR module represents an opaque top-level container operation. It contains a +single region containing a single block that is comprised of any operations. +Operations within this region must not implicitly capture values defined above +it. Modules have an optional symbol name that can be used to refer to them in +operations. + +### Functions + +An MLIR Function is an operation with a name containing one [region](#regions). +The region of a function is not allowed to implicitly capture values defined +outside of the function, and all external references must use function arguments +or attributes that establish a symbolic connection (e.g. symbols referenced by +name via a string attribute like [SymbolRefAttr](#symbol-reference-attribute)): + +``` +function ::= `func` function-signature function-attributes? function-body? + +function-signature ::= symbol-ref-id `(` argument-list `)` + (`->` function-result-list)? + +argument-list ::= (named-argument (`,` named-argument)*) | /*empty*/ +argument-list ::= (type attribute-dict? (`,` type attribute-dict?)*) | /*empty*/ +named-argument ::= ssa-id `:` type attribute-dict? + +function-result-list ::= function-result-list-parens + | non-function-type +function-result-list-parens ::= `(` `)` + | `(` function-result-list-no-parens `)` +function-result-list-no-parens ::= function-result (`,` function-result)* +function-result ::= type attribute-dict? + +function-attributes ::= `attributes` attribute-dict +function-body ::= region +``` + +An external function declaration (used when referring to a function declared in +some other module) has no body. While the MLIR textual form provides a nice +inline syntax for function arguments, they are internally represented as "block +arguments" to the first block in the region. + +Only dialect attribute names may be specified in the attribute dictionaries for +function arguments, results, or the function itself. + +Examples: + +```mlir +// External function definitions. 
+
func @abort()
+func @scribble(i32, i64, memref<? x 128 x f32, #layout_map0>) -> f64
+
+// A function that returns its argument twice:
+func @count(%x: i64) -> (i64, i64)
+  attributes {fruit: "banana"} {
+  return %x, %x: i64, i64
+}
+
+// A function with an argument attribute
+func @example_fn_arg(%x: i32 {swift.self = unit})
+
+// A function with a result attribute
+func @example_fn_result() -> (f64 {dialectName.attrName = 0 : i64})
+
+// A function with an attribute
+func @example_fn_attr() attributes {dialectName.attrName = false}
+```
+
+## Blocks
+
+Syntax:
+
+```
+block           ::= block-label operation+
+block-label     ::= block-id block-arg-list? `:`
+block-id        ::= caret-id
+caret-id        ::= `^` suffix-id
+ssa-id-and-type ::= ssa-id `:` type
+
+// Non-empty list of names and types.
+ssa-id-and-type-list ::= ssa-id-and-type (`,` ssa-id-and-type)*
+
+block-arg-list ::= `(` ssa-id-and-type-list? `)`
+```
+
+A [block](https://en.wikipedia.org/wiki/Basic_block) is a sequential list of
+operations without control flow (calls are not considered control flow for this
+purpose) that are executed from top to bottom. The last operation in a block is
+a [terminator operation](#terminator-operations), which ends the block.
+
+Blocks in MLIR take a list of block arguments, which represent SSA PHI nodes in
+a functional notation. The arguments are defined by the block, and values are
+provided for these block arguments by branches that go to the block.
+
+Here is a simple example function showing branches, returns, and block
+arguments:
+
+```mlir
+func @simple(i64, i1) -> i64 {
+^bb0(%a: i64, %cond: i1): // Code dominated by ^bb0 may refer to %a
+  cond_br %cond, ^bb1, ^bb2
+
+^bb1:
+  br ^bb3(%a: i64)    // Branch passes %a as the argument
+
+^bb2:
+  %b = addi %a, %a : i64
+  br ^bb3(%b: i64)    // Branch passes %b as the argument
+
+// ^bb3 receives an argument, named %c, from predecessors
+// and passes it on to bb4 twice. 
+^bb3(%c: i64): + br ^bb4(%c, %c : i64, i64) + +^bb4(%d : i64, %e : i64): + %0 = addi %d, %e : i64 + return %0 : i64 +} +``` + +**Context:** The "block argument" representation eliminates a number of special +cases from the IR compared to traditional "PHI nodes are operations" SSA IRs +(like LLVM). For example, the +[parallel copy semantics](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.524.5461&rep=rep1&type=pdf) +of SSA is immediately apparent, and function arguments are no longer a special +case: they become arguments to the entry block +[[more rationale](Rationale.md#block-arguments-vs-phi-nodes)]. + +## Regions + +### Definition + +A region is a CFG of MLIR [Blocks](#blocks). Regions serve to group semantically +connected blocks, where the semantics is not imposed by the IR. Instead, the +containing operation defines the semantics of the regions it contains. Regions +do not have a name or an address, only the blocks contained in a region do. +Regions are meaningless outside of the containing entity and have no type or +attributes. + +The first block in the region cannot be a successor of any other block. The +syntax for the region is as follows: + +``` +region ::= `{` block* `}` +``` + +The function body is an example of a region: it consists of a CFG of blocks and +has additional semantic restrictions that other types of regions may not have +(block terminators must either branch to a different block, or return from a +function where the types of the `return` arguments must match the result types +of the function signature). + +### Control and Value Scoping + +Regions provide nested control isolation: it is impossible to branch to a block +within a region from outside it or to branch from within a region to a block +outside it. Similarly, it provides a natural scoping for value visibility: SSA +values defined in a region don't escape to the enclosing region, if any. 
By +default, a region can reference values defined outside of the region whenever it +would have been legal to use them as operands to the enclosing operation. + +Example: + +```mlir +func @accelerator_compute(i64, i1) -> i64 { +^bb0(%a: i64, %cond: i1): // Code dominated by ^bb0 may refer to %a + cond_br %cond, ^bb1, ^bb2 + +^bb1: + // This def for %value does not dominate ^bb2 + %value = "op.convert"(%a) : (i64) -> i64 + br ^bb3(%a: i64) // Branch passes %a as the argument + +^bb2: + "accelerator.launch"() { + ^bb0: + // Region of code nested under "accelerator.launch", it can reference %a but + // not %value. + %new_value = "accelerator.do_something"(%a) : (i64) -> () + } + // %new_value cannot be referenced outside of the region + +^bb3: + ... +} +``` + +This can be further restricted using the custom verifier associated with the +enclosing operation, for example, disallowing references to values defined +outside the region completely. + +### Control Flow + +Regions are Single-Entry-Multiple-Exit (SEME). This means that control can only +flow into the first block of the region, but can flow out of the region at the +end of any of the contained blocks (This behavior is similar to that of a +function body in most programming languages). When exiting a Region, control is +returned to the enclosing operation. + +The enclosing operation determines the way in which control is transmitted into +the entry block of a Region. The successor to a region’s exit points may not +necessarily exist: for example a call to a function that does not return. +Concurrent or asynchronous execution of regions is unspecified. Operations may +define specific rules of execution, e.g. sequential loops or switch cases. + +A Region may also enter another region within the enclosing operation. If an +operation has multiple regions, the semantics of the operation defines into +which regions the control flows and in which order, if any. 
An operation may +transmit control into regions that were specified in other operations, in +particular those that defined the values the given operation uses. Thus such +operations can be treated opaquely in the enclosing control flow graph, +providing a level of control flow isolation similar to that of the call +operation. + +#### Closure + +Regions allow defining an operation that creates a closure, for example by +“boxing” the body of the region into a value they produce. It remains up to the +operation to define its semantics. Note that if an operation triggers +asynchronous execution of the region, it is under the responsibility of the +operation caller to wait for the region to be executed guaranteeing that any +directly used values remain live. + +### Arguments and Results + +The arguments of the first block of a region are treated as arguments of the +region. The source of these arguments is defined by the semantics of the parent +operation. They may correspond to some of the values the operation itself uses. + +Regions produce a (possibly empty) list of values. The operation semantics +defines the relation between the region results and the operation results. + +## Type System + +Each SSA value in MLIR has a type defined by the type system below. There are a +number of primitive types (like integers) and also aggregate types for tensors +and memory buffers. MLIR [standard types](#standard-types) do not include +structures, arrays, or dictionaries. + +MLIR has an open type system (i.e. there is no fixed list of types), and types +may have application-specific semantics. For example, MLIR supports a set of +[dialect types](#dialect-types). + +``` +type ::= type-alias | dialect-type | standard-type + +type-list-no-parens ::= type (`,` type)* +type-list-parens ::= `(` `)` + | `(` type-list-no-parens `)` + +// This is a common way to refer to an SSA value with a specified type. +ssa-use-and-type ::= ssa-use `:` type + +// Non-empty list of names and types. 
+ssa-use-and-type-list ::= ssa-use-and-type (`,` ssa-use-and-type)* +``` + +### Type Aliases + +``` +type-alias-def ::= '!' alias-name '=' 'type' type +type-alias ::= '!' alias-name +``` + +MLIR supports defining named aliases for types. A type alias is an identifier +that can be used in the place of the type that it defines. These aliases *must* +be defined before their uses. Alias names may not contain a '.', since those +names are reserved for [dialect types](#dialect-types). + +Example: + +```mlir +!avx_m128 = type vector<4 x f32> + +// Using the original type. +"foo"(%x) : vector<4 x f32> -> () + +// Using the type alias. +"foo"(%x) : !avx_m128 -> () +``` + +### Dialect Types + +Similarly to operations, dialects may define custom extensions to the type +system. + +``` +dialect-namespace ::= bare-id + +opaque-dialect-item ::= dialect-namespace '<' string-literal '>' + +pretty-dialect-item ::= dialect-namespace '.' pretty-dialect-item-lead-ident + pretty-dialect-item-body? + +pretty-dialect-item-lead-ident ::= '[A-Za-z][A-Za-z0-9._]*' +pretty-dialect-item-body ::= '<' pretty-dialect-item-contents+ '>' +pretty-dialect-item-contents ::= pretty-dialect-item-body + | '(' pretty-dialect-item-contents+ ')' + | '[' pretty-dialect-item-contents+ ']' + | '{' pretty-dialect-item-contents+ '}' + | '[^[<({>\])}\0]+' + +dialect-type ::= '!' opaque-dialect-item +dialect-type ::= '!' pretty-dialect-item +``` + +Dialect types can be specified in a verbose form, e.g. like this: + +```mlir +// LLVM type that wraps around llvm IR types. +!llvm<"i32*"> + +// Tensor flow string type. +!tf.string + +// Complex type +!foo<"something"> + +// Even more complex type +!foo<"something>>"> +``` + +Dialect types that are simple enough can use the pretty format, which is a +lighter weight syntax that is equivalent to the above forms: + +```mlir +// Tensor flow string type. 
+
!tf.string
+
+// Complex type
+!foo.something
+```
+
+Sufficiently complex dialect types are required to use the verbose form for
+generality. For example, the more complex type shown above wouldn't be valid in
+the lighter syntax: `!foo.something>>` because it contains characters
+that are not allowed in the lighter syntax, as well as unbalanced `<>`
+characters.
+
+See [here](DefiningAttributesAndTypes.md) to learn how to define dialect types.
+
+### Standard Types
+
+Standard types are a core set of [dialect types](#dialect-types) that are
+defined in a builtin dialect and thus available to all users of MLIR.
+
+```
+standard-type ::= complex-type
+                | float-type
+                | function-type
+                | index-type
+                | integer-type
+                | memref-type
+                | none-type
+                | tensor-type
+                | tuple-type
+                | vector-type
+```
+
+#### Complex Type
+
+Syntax:
+
+```
+complex-type ::= `complex` `<` type `>`
+```
+
+The value of `complex` type represents a complex number with a parameterized
+element type, which is composed of a real and imaginary value of that element
+type. The element must be a floating point or integer scalar type.
+
+Examples:
+
+```mlir
+complex<f32>
+complex<i32>
+```
+
+#### Floating Point Types
+
+Syntax:
+
+```
+// Floating point.
+float-type ::= `f16` | `bf16` | `f32` | `f64`
+```
+
+MLIR supports float types of certain widths that are widely used as indicated
+above.
+
+#### Function Type
+
+Syntax:
+
+```
+// MLIR functions can return multiple values.
+function-result-type ::= type-list-parens
+                       | non-function-type
+
+function-type ::= type-list-parens `->` function-result-type
+```
+
+MLIR supports first-class functions: for example, the
+[`constant` operation](Dialects/Standard.md#constant-operation) produces the
+address of a function as an SSA value. This SSA value may be passed to and
+returned from functions, merged across control flow boundaries with
+[block arguments](#blocks), and called with the
+[`call_indirect` operation](Dialects/Standard.md#call-indirect-operation). 
+ +Function types are also used to indicate the arguments and results of +[operations](#operations). + +#### Index Type + +Syntax: + +``` +// Target word-sized integer. +index-type ::= `index` +``` + +The `index` type is a signless integer whose size is equal to the natural +machine word of the target ([rationale](Rationale.md#signless-types)) and is +used by the affine constructs in MLIR. Unlike fixed-size integers, it cannot be +used as an element of vector, tensor or memref type +([rationale](Rationale.md#index-type-disallowed-in-vectortensormemref-types)). + +**Rationale:** integers of platform-specific bit widths are practical to express +sizes, dimensionalities and subscripts. + +#### Integer Type + +Syntax: + +``` +// Sized integers like i1, i4, i8, i16, i32. +integer-type ::= `i` [1-9][0-9]* +``` + +MLIR supports arbitrary precision integer types. Integer types are signless, but +have a designated width. + +**Rationale:** low precision integers (like `i2`, `i4` etc) are useful for +low-precision inference chips, and arbitrary precision integers are useful for +hardware synthesis (where a 13 bit multiplier is a lot cheaper/smaller than a 16 +bit one). + +TODO: Need to decide on a representation for quantized integers +([initial thoughts](Rationale.md#quantized-integer-operations)). + +#### Memref Type + +Syntax: + +``` +memref-type ::= ranked-memref-type | unranked-memref-type + +ranked-memref-type ::= `memref` `<` dimension-list-ranked tensor-memref-element-type + (`,` layout-specification)? | + (`,` memory-space)? `>` + +unranked-memref-type ::= `memref` `<*x` tensor-memref-element-type + (`,` memory-space)? `>` + +stride-list ::= `[` (dimension (`,` dimension)*)? 
`]`
+strided-layout ::= `offset:` dimension `,` `strides: ` stride-list
+layout-specification ::= semi-affine-map | strided-layout
+memory-space ::= integer-literal /* | TODO: address-space-id */
+```
+
+A `memref` type is a reference to a region of memory (similar to a buffer
+pointer, but more powerful). The buffer pointed to by a memref can be allocated,
+aliased and deallocated. A memref can be used to read and write data from/to the
+memory region which it references. Memref types use the same shape specifier as
+tensor types. Note that `memref<f32>`, `memref<0 x f32>`, `memref<1 x 0 x f32>`,
+and `memref<0 x 1 x f32>` are all different types.
+
+A `memref` is allowed to have an unknown rank (e.g. `memref<*xf32>`). The
+purpose of unranked memrefs is to allow external library functions to receive
+memref arguments of any rank without versioning the functions based on the rank.
+Other uses of this type are disallowed or will have undefined behavior.
+
+##### Codegen of Unranked Memref
+
+Using unranked memref in codegen besides the case mentioned above is highly
+discouraged. Codegen is concerned with generating loop nests and specialized
+instructions for high performance; unranked memref is concerned with hiding the
+rank and thus, the number of enclosing loops required to iterate over the data.
+However, if there is a need to code-gen unranked memref, one possible path is to
+cast into a static ranked type based on the dynamic rank. Another possible path
+is to emit a single while loop conditioned on a linear index and perform
+delinearization of the linear index to a dynamic array containing the (unranked)
+indices. While this is possible, it is expected to not be a good idea to perform
+this during codegen as the cost of the translations is expected to be
+prohibitive and optimizations at this level are not expected to be worthwhile. 
+
If expressiveness is the main concern, irrespective of performance, passing
+unranked memrefs to an external C++ library and implementing rank-agnostic logic
+there is expected to be significantly simpler.
+
+Unranked memrefs may provide expressiveness gains in the future and help bridge
+the gap with unranked tensors. Unranked memrefs will not be expected to be
+exposed to codegen but one may query the rank of an unranked memref (a special
+op will be needed for this purpose) and perform a switch and cast to a ranked
+memref as a prerequisite to codegen.
+
+Example:
+
+```mlir
+// With static ranks, we need a function for each possible argument type
+%A = alloc() : memref<16x32xf32>
+%B = alloc() : memref<16x32x64xf32>
+call @helper_2D(%A) : (memref<16x32xf32>)->()
+call @helper_3D(%B) : (memref<16x32x64xf32>)->()
+
+// With unknown rank, the functions can be unified under one unranked type
+%A = alloc() : memref<16x32xf32>
+%B = alloc() : memref<16x32x64xf32>
+// Remove rank info
+%A_u = memref_cast %A : memref<16x32xf32> -> memref<*xf32>
+%B_u = memref_cast %B : memref<16x32x64xf32> -> memref<*xf32>
+// call same function with dynamic ranks
+call @helper(%A_u) : (memref<*xf32>)->()
+call @helper(%B_u) : (memref<*xf32>)->()
+```
+
+The core syntax and representation of a layout specification is a
+[semi-affine map](Dialects/Affine.md#semi-affine-maps). Additionally, syntactic
+sugar is supported to make certain layout specifications more intuitive to read.
+For the moment, a `memref` supports parsing a strided form which is converted to
+a semi-affine map automatically.
+
+The memory space of a memref is specified by a target-specific integer index. If
+no memory space is specified, then the default memory space (0) is used. The
+default space is target specific but always at index 0.
+
+TODO: MLIR will eventually have target-dialects which allow symbolic use of
+memory hierarchy names (e.g. L3, L2, L1, ...) 
but we have not spec'd the details
+of that mechanism yet. Until then, this document pretends that it is valid to
+refer to these memories by `bare-id`.
+
+The notionally dynamic value of a memref value includes the address of the
+buffer allocated, as well as the symbols referred to by the shape, layout map,
+and index maps.
+
+Examples of memref static type
+
+```mlir
+// Identity index/layout map
+#identity = (d0, d1) -> (d0, d1)
+
+// Column major layout.
+#col_major = (d0, d1, d2) -> (d2, d1, d0)
+
+// A 2-d tiled layout with tiles of size 128 x 256.
+#tiled_2d_128x256 = (d0, d1) -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)
+
+// A tiled data layout with non-constant tile sizes.
+#tiled_dynamic = (d0, d1)[s0, s1] -> (d0 floordiv s0, d1 floordiv s1,
+                                      d0 mod s0, d1 mod s1)
+
+// A layout that yields a padding on two at either end of the minor dimension.
+#padded = (d0, d1) -> (d0, (d1 + 2) floordiv 2, (d1 + 2) mod 2)
+
+
+// The dimension list "16x32" defines the following 2D index space:
+//
+//   { (i, j) : 0 <= i < 16, 0 <= j < 32 }
+//
+memref<16x32xf32, #identity, memspace0>
+
+// The dimension list "16x4x?" defines the following 3D index space:
+//
+//   { (i, j, k) : 0 <= i < 16, 0 <= j < 4, 0 <= k < N }
+//
+// where N is a symbol which represents the runtime value of the size of
+// the third dimension.
+//
+// %N here binds to the size of the third dimension.
+%A = alloc(%N) : memref<16x4x?xf32, #col_major, memspace0>
+
+// A 2-d dynamic shaped memref that also has a dynamically sized tiled layout.
+// The memref index space is of size %M x %N, while %B1 and %B2 bind to the
+// symbols s0, s1 respectively of the layout map #tiled_dynamic. Data tiles of
+// size %B1 x %B2 in the logical space will be stored contiguously in memory.
+// The allocation size will be (%M ceildiv %B1) * %B1 * (%N ceildiv %B2) * %B2
+// f32 elements.
+%T = alloc(%M, %N) [%B1, %B2] : memref<?x?xf32, #tiled_dynamic>
+
+// A memref that has a two-element padding at either end. 
The allocation size
+// will fit 16 * 68 float elements of data.
+%P = alloc() : memref<16x64xf32, #padded>
+
+// Affine map with symbol 's0' used as offset for the first dimension.
+#imapS = (d0, d1) [s0] -> (d0 + s0, d1)
+// Allocate memref and bind the following symbols:
+// '%n' is bound to the dynamic second dimension of the memref type.
+// '%o' is bound to the symbol 's0' in the affine map of the memref type.
+%n = ...
+%o = ...
+%A = alloc (%n)[%o] : memref<16x?xf32, #imapS>
+```
+
+##### Index Space
+
+A memref dimension list defines an index space within which the memref can be
+indexed to access data.
+
+##### Index
+
+Data is accessed through a memref type using a multidimensional index into the
+multidimensional index space defined by the memref's dimension list.
+
+Examples
+
+```mlir
+// Allocates a memref with 2D index space:
+//   { (i, j) : 0 <= i < 16, 0 <= j < 32 }
+%A = alloc() : memref<16x32xf32, #imapA, memspace0>
+
+// Loads data from memref '%A' using a 2D index: (%i, %j)
+%v = load %A[%i, %j] : memref<16x32xf32, #imapA, memspace0>
+```
+
+##### Index Map
+
+An index map is a one-to-one
+[semi-affine map](Dialects/Affine.md#semi-affine-maps) that transforms a
+multidimensional index from one index space to another. For example, the
+following figure shows an index map which maps a 2-dimensional index from a 2x2
+index space to a 3x3 index space, using symbols `S0` and `S1` as offsets.
+
+![Index Map Example](includes/img/index-map.svg)
+
+The number of domain dimensions and range dimensions of an index map can be
+different, but must match the number of dimensions of the input and output index
+spaces on which the map operates. The index space is always non-negative and
+integral. In addition, an index map must specify the size of each of its range
+dimensions onto which it maps. Index map symbols must be listed in order with
+symbols for dynamic dimension sizes first, followed by other required symbols. 
+
+##### Layout Map
+
+A layout map is a [semi-affine map](Dialects/Affine.md#semi-affine-maps) which
+encodes logical to physical index space mapping, by mapping input dimensions to
+their ordering from most-major (slowest varying) to most-minor (fastest
+varying). Therefore, an identity layout map corresponds to a row-major layout.
+Identity layout maps do not contribute to the MemRef type identification and are
+discarded on construction. That is, a type with an explicit identity map,
+`memref<?x?xf32, (i,j)->(i,j)>`, is strictly the same as the one without layout
+maps, `memref<?x?xf32>`.
+
+Layout map examples:
+
+```mlir
+// MxN matrix stored in row major layout in memory:
+#layout_map_row_major = (i, j) -> (i, j)
+
+// MxN matrix stored in column major layout in memory:
+#layout_map_col_major = (i, j) -> (j, i)
+
+// MxN matrix stored in a 2-d blocked/tiled layout with 64x64 tiles.
+#layout_tiled = (i, j) -> (i floordiv 64, j floordiv 64, i mod 64, j mod 64)
+```
+
+##### Affine Map Composition
+
+A memref specifies a semi-affine map composition as part of its type. A
+semi-affine map composition is a composition of semi-affine maps beginning with
+zero or more index maps, and ending with a layout map. The composition must be
+conformant: the number of dimensions of the range of one map, must match the
+number of dimensions of the domain of the next map in the composition.
+
+The semi-affine map composition specified in the memref type, maps from accesses
+used to index the memref in load/store operations to other index spaces (i.e.
+logical to physical index mapping). Each of the
+[semi-affine maps](Dialects/Affine.md) and thus its composition is required to
+be one-to-one.
+
+The semi-affine map composition can be used in dependence analysis, memory
+access pattern analysis, and for performance optimizations like vectorization,
+copy elision and in-place updates. If an affine map composition is not specified
+for the memref, the identity affine map is assumed. 
+
+##### Strided MemRef
+
+A memref may specify strides as part of its type. A stride specification is a
+list of integer values that are either static or `?` (dynamic case). Strides
+encode the distance, in number of elements, in (linear) memory between
+successive entries along a particular dimension. A stride specification is
+syntactic sugar for an equivalent strided memref representation using
+semi-affine maps. For example, `memref<42x16xf32, offset: 33, strides: [1, 64]>`
+specifies a non-contiguous memory region of `42` by `16` `f32` elements such
+that:
+
+1.  the minimal size of the enclosing memory region must be `33 + 42 * 1 + 16 *
+    64 = 1066` elements;
+2.  the address calculation for accessing element `(i, j)` computes `33 + i +
+    64 * j`
+3.  the distance between two consecutive elements along the outer dimension is
+    `1` element and the distance between two consecutive elements along the
+    inner dimension is `64` elements.
+
+This corresponds to a column major view of the memory region and is internally
+represented as the type `memref<42x16xf32, (i, j) -> (33 + i + 64 * j)>`.
+
+The specification of strides must not alias: given an n-D strided memref,
+indices `(i1, ..., in)` and `(j1, ..., jn)` may not refer to the same memory
+address unless `i1 == j1, ..., in == jn`.
+
+Strided memrefs represent a view abstraction over preallocated data. They are
+constructed with special ops, yet to be introduced. Strided memrefs are a
+special subclass of memrefs with generic semi-affine map and correspond to a
+normalized memref descriptor when lowering to LLVM.
+
+#### None Type
+
+Syntax:
+
+```
+none-type ::= `none`
+```
+
+The `none` type is a unit type, i.e. a type with exactly one possible value,
+where its value does not have a defined dynamic representation. 
+
+#### Tensor Type
+
+Syntax:
+
+```
+tensor-type ::= `tensor` `<` dimension-list tensor-memref-element-type `>`
+tensor-memref-element-type ::= vector-element-type | vector-type | complex-type
+
+// memref requires a known rank, but tensor does not.
+dimension-list ::= dimension-list-ranked | (`*` `x`)
+dimension-list-ranked ::= (dimension `x`)*
+dimension ::= `?` | decimal-literal
+```
+
+SSA values of tensor type represent aggregate N-dimensional data values, and
+have a known element type. It may have an unknown rank (indicated by `*`) or may
+have a fixed rank with a list of dimensions. Each dimension may be a static
+non-negative decimal constant or be dynamically determined (indicated by `?`).
+
+The runtime representation of the MLIR tensor type is intentionally abstracted -
+you cannot control layout or get a pointer to the data. For low level buffer
+access, MLIR has a [`memref` type](#memref-type). This abstracted runtime
+representation holds both the tensor data values as well as information about
+the (potentially dynamic) shape of the tensor. The
+[`dim` operation](Dialects/Standard.md#dim-operation) returns the size of a
+dimension from a value of tensor type.
+
+Note: hexadecimal integer literals are not allowed in tensor type declarations
+to avoid confusion between `0xf32` and `0 x f32`. Zero sizes are allowed in
+tensors and treated as other sizes, e.g., `tensor<0 x 1 x i32>` and `tensor<1 x
+0 x i32>` are different types. Since zero sizes are not allowed in some other
+types, such tensors should be optimized away before lowering tensors to vectors.
+
+Examples:
+
+```mlir
+// Tensor with unknown rank.
+tensor<* x f32>
+
+// Known rank but unknown dimensions.
+tensor<? x ? x ? x ? x f32>
+
+// Partially known dimensions.
+tensor<? x ? x 13 x ? x f32>
+
+// Full static shape.
+tensor<17 x 4 x 13 x 4 x f32>
+
+// Tensor with rank zero. Represents a scalar.
+tensor<f32>
+
+// Zero-element dimensions are allowed. 
+tensor<0 x 42 x f32>

// Zero-element tensor of f32 type (hexadecimal literals not allowed here).
tensor<0xf32>
```

#### Tuple Type

Syntax:

```
tuple-type ::= `tuple` `<` (type ( `,` type)*)? `>`
```

The value of `tuple` type represents a fixed-size collection of elements, where
each element may be of a different type.

**Rationale:** Though this type is first class in the type system, MLIR provides
no standard operations for operating on `tuple` types
([rationale](Rationale.md#tuple-types)).

Examples:

```mlir
// Empty tuple.
tuple<>

// Single element
tuple<f32>

// Many elements.
tuple<i32, f32, tensor<i1>, i5>
```

#### Vector Type

Syntax:

```
vector-type ::= `vector` `<` static-dimension-list vector-element-type `>`
vector-element-type ::= float-type | integer-type

static-dimension-list ::= (decimal-literal `x`)+
```

The vector type represents a SIMD style vector, used by target-specific
operation sets like AVX. While the most common use is for 1D vectors (e.g.
vector<16 x f32>) we also support multidimensional registers on targets that
support them (like TPUs).

Vector shapes must be positive decimal integers.

Note: hexadecimal integer literals are not allowed in vector type declarations,
`vector<0x42xi32>` is invalid because it is interpreted as a 2D vector with
shape `(0, 42)` and zero shapes are not allowed.

## Attributes

Syntax:

```
attribute-dict ::= `{` `}`
                 | `{` attribute-entry (`,` attribute-entry)* `}`
attribute-entry ::= dialect-attribute-entry | dependent-attribute-entry
dialect-attribute-entry ::= dialect-namespace `.` bare-id `=` attribute-value
dependent-attribute-entry ::= dependent-attribute-name `=` attribute-value
dependent-attribute-name ::= (letter|[_]) (letter|digit|[_$])*
```

Attributes are the mechanism for specifying constant data on operations in
places where a variable is never allowed - e.g.
the index of a +[`dim` operation](Dialects/Standard.md#dim-operation), or the stride of a +convolution. They consist of a name and a concrete attribute value. The set of +expected attributes, their structure, and their interpretation are all +contextually dependent on what they are attached to. + +There are two main classes of attributes: dependent and dialect. Dependent +attributes derive their structure and meaning from what they are attached to; +e.g., the meaning of the `index` attribute on a `dim` operation is defined by +the `dim` operation. Dialect attributes, on the other hand, derive their context +and meaning from a specific dialect. An example of a dialect attribute may be a +`swift.self` function argument attribute that indicates an argument is the +self/context parameter. The context of this attribute is defined by the `swift` +dialect and not the function argument. + +Attribute values are represented by the following forms: + +``` +attribute-value ::= attribute-alias | dialect-attribute | standard-attribute +``` + +### Attribute Value Aliases + +``` +attribute-alias ::= '#' alias-name '=' attribute-value +attribute-alias ::= '#' alias-name +``` + +MLIR supports defining named aliases for attribute values. An attribute alias is +an identifier that can be used in the place of the attribute that it defines. +These aliases *must* be defined before their uses. Alias names may not contain a +'.', since those names are reserved for +[dialect attributes](#dialect-attribute-values). + +Example: + +```mlir +#map = (d0) -> (d0 + 10) + +// Using the original attribute. +%b = affine.apply (d0) -> (d0 + 10) (%a) + +// Using the attribute alias. +%b = affine.apply #map(%a) +``` + +### Dialect Attribute Values + +Similarly to operations, dialects may define custom attribute values. 
The
syntactic structure of these values is identical to custom dialect type values,
except that dialect attribute values are distinguished with a leading '#',
while dialect types are distinguished with a leading '!'.

```
dialect-attribute ::= '#' opaque-dialect-item
dialect-attribute ::= '#' pretty-dialect-item
```

Dialect attributes can be specified in a verbose form, e.g. like this:

```mlir
// Complex attribute
#foo<"something<abcd>">

// Even more complex attribute
#foo<"something<a%%123^^^>>>">
```

Dialect attributes that are simple enough can use the pretty format, which is a
lighter weight syntax that is equivalent to the above forms:

```mlir
// Complex attribute
#foo.something<abcd>
```

Sufficiently complex dialect attributes are required to use the verbose form for
generality. For example, the more complex type shown above wouldn't be valid in
the lighter syntax: `#foo.something<a%%123^^^>>>` because it contains characters
that are not allowed in the lighter syntax, as well as unbalanced `<>`
characters.

See [here](DefiningAttributesAndTypes.md) to learn how to define dialect
attribute values.

### Standard Attribute Values

Standard attributes are a core set of
[dialect attributes](#dialect-attribute-values) that are defined in a builtin
dialect and thus available to all users of MLIR.

```
standard-attribute ::= affine-map-attribute
                     | array-attribute
                     | bool-attribute
                     | dictionary-attribute
                     | elements-attribute
                     | float-attribute
                     | integer-attribute
                     | integer-set-attribute
                     | string-attribute
                     | symbol-ref-attribute
                     | type-attribute
                     | unit-attribute
```

#### AffineMap Attribute

Syntax:

```
affine-map-attribute ::= affine-map
```

An affine-map attribute is an attribute that represents an affine-map object.

#### Array Attribute

Syntax:

```
array-attribute ::= `[` (attribute-value (`,` attribute-value)*)?
`]` +``` + +An array attribute is an attribute that represents a collection of attribute +values. + +#### Boolean Attribute + +Syntax: + +``` +bool-attribute ::= bool-literal +``` + +A boolean attribute is a literal attribute that represents a one-bit boolean +value, true or false. + +#### Dictionary Attribute + +Syntax: + +``` +dictionary-attribute ::= `{` (attribute-entry (`,` attribute-entry)*)? `}` +``` + +A dictionary attribute is an attribute that represents a sorted collection of +named attribute values. The elements are sorted by name, and each name must be +unique within the collection. + +#### Elements Attributes + +Syntax: + +``` +elements-attribute ::= dense-elements-attribute + | opaque-elements-attribute + | sparse-elements-attribute +``` + +An elements attribute is a literal attribute that represents a constant +[vector](#vector-type) or [tensor](#tensor-type) value. + +##### Dense Elements Attribute + +Syntax: + +``` +dense-elements-attribute ::= `dense` `<` attribute-value `>` `:` + ( tensor-type | vector-type ) +``` + +A dense elements attribute is an elements attribute where the storage for the +constant vector or tensor value has been packed to the element bitwidth. The +element type of the vector or tensor constant must be of integer, index, or +floating point type. + +##### Opaque Elements Attribute + +Syntax: + +``` +opaque-elements-attribute ::= `opaque` `<` dialect-namespace `,` + hex-string-literal `>` `:` + ( tensor-type | vector-type ) +``` + +An opaque elements attribute is an elements attribute where the content of the +value is opaque. The representation of the constant stored by this elements +attribute is only understood, and thus decodable, by the dialect that created +it. + +Note: The parsed string literal must be in hexadecimal form. 
+ +##### Sparse Elements Attribute + +Syntax: + +``` +sparse-elements-attribute ::= `sparse` `<` attribute-value `,` attribute-value + `>` `:` ( tensor-type | vector-type ) +``` + +A sparse elements attribute is an elements attribute that represents a sparse +vector or tensor object. This is where very few of the elements are non-zero. + +The attribute uses COO (coordinate list) encoding to represent the sparse +elements of the elements attribute. The indices are stored via a 2-D tensor of +64-bit integer elements with shape [N, ndims], which specifies the indices of +the elements in the sparse tensor that contains non-zero values. The element +values are stored via a 1-D tensor with shape [N], that supplies the +corresponding values for the indices. + +Example: + +```mlir + sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32> + +// This represents the following tensor: +/// [[1, 0, 0, 0], +/// [0, 0, 5, 0], +/// [0, 0, 0, 0]] +``` + +#### Float Attribute + +Syntax: + +``` +float-attribute ::= (float-literal (`:` float-type)?) + | (hexadecimal-literal `:` float-type) +``` + +A float attribute is a literal attribute that represents a floating point value +of the specified [float type](#floating-point-types). It can be represented in +the hexadecimal form where the hexadecimal value is interpreted as bits of the +underlying binary representation. This form is useful for representing infinity +and NaN floating point values. To avoid confusion with integer attributes, +hexadecimal literals _must_ be followed by a float type to define a float +attribute. + +Examples: + +``` +42.0 // float attribute defaults to f64 type +42.0 : f32 // float attribute of f32 type +0x7C00 : f16 // positive infinity +0x7CFF : f16 // NaN (one of possible values) +42 : f32 // Error: expected integer type +``` + +#### Integer Attribute + +Syntax: + +``` +integer-attribute ::= integer-literal ( `:` (index-type | integer-type) )? 
+
```

An integer attribute is a literal attribute that represents an integral value of
the specified integer or index type. The default type for this attribute, if one
is not specified, is a 64-bit integer.

#### Integer Set Attribute

Syntax:

```
integer-set-attribute ::= affine-map
```

An integer-set attribute is an attribute that represents an integer-set object.

#### String Attribute

Syntax:

```
string-attribute ::= string-literal (`:` type)?
```

A string attribute is an attribute that represents a string literal value.

#### Symbol Reference Attribute

Syntax:

```
symbol-ref-attribute ::= symbol-ref-id (`::` symbol-ref-id)*
```

A symbol reference attribute is a literal attribute that represents a named
reference to an operation that is nested within an operation with the
`OpTrait::SymbolTable` trait. As such, this reference is given meaning by the
nearest parent operation containing the `OpTrait::SymbolTable` trait. It may
optionally contain a set of nested references that further resolve to a symbol
nested within a different symbol table.

This attribute can only be held internally by
[array attributes](#array-attribute) and
[dictionary attributes](#dictionary-attribute) (including the top-level
operation attribute dictionary), i.e. no other attribute kinds such as Locations
or extended attribute kinds. If a reference to a symbol is necessary from
outside of the symbol table that the symbol is defined in, a
[string attribute](#string-attribute) can be used to refer to the symbol name.

**Rationale:** Given that MLIR models global accesses with symbol references, to
enable efficient multi-threading, it becomes difficult to effectively reason
about their uses. By restricting the places that can legally hold a symbol
reference, we can always opaquely reason about a symbol's usage characteristics.
+ +#### Type Attribute + +Syntax: + +``` +type-attribute ::= type +``` + +A type attribute is an attribute that represents a [type object](#type-system). + +#### Unit Attribute + +``` +unit-attribute ::= `unit` +``` + +A unit attribute is an attribute that represents a value of `unit` type. The +`unit` type allows only one value forming a singleton set. This attribute value +is used to represent attributes that only have meaning from their existence. + +One example of such an attribute could be the `swift.self` attribute. This +attribute indicates that a function parameter is the self/context parameter. It +could be represented as a [boolean attribute](#boolean-attribute)(true or +false), but a value of false doesn't really bring any value. The parameter +either is the self/context or it isn't. + +```mlir +// A unit attribute defined with the `unit` value specifier. +func @verbose_form(i1) attributes {dialectName.unitAttr = unit} + +// A unit attribute can also be defined without the value specifier. +func @simple_form(i1) attributes {dialectName.unitAttr} +``` diff --git a/mlir/docs/MLIRForGraphAlgorithms.md b/mlir/docs/MLIRForGraphAlgorithms.md new file mode 100644 index 0000000000000000000000000000000000000000..ac26e5beb9b93829945e8f25a1192a390650fea1 --- /dev/null +++ b/mlir/docs/MLIRForGraphAlgorithms.md @@ -0,0 +1,403 @@ +# MLIR: Incremental Application to Graph Algorithms in ML Frameworks + +The existing documentation about MLIR focuses on long term vision, how its +pieces fit together, and the benefits of modular and composable infrastructure +in the vast and distant future. While this viewpoint appeals to some, it causes +concern for others who are more concerned about the "here and now" - why does it +make sense to make a "revolutionary" change when any individual problem can be +fixed in place? 
+ +This document explains that adoption of MLIR to solve graph based problems +_isn't_ a revolutionary change: it is an incremental series of steps which build +on each other, each of which delivers local value. This document also addresses +some points of confusion that keep coming up. + +One note: even though a major advantage of MLIR is that it can span the full +spectrum from graph algorithms down to low-level code generation, this document +focuses on the use of MLIR for **graph-level algorithms**. MLIR will also unlock +exciting code generation opportunities (particularly given its novel approach to +integrating state of the art polyhedral techniques), but issues that touch on +MLIR's relationship to XLA, Eigen, etc, are out of scope for this particular +doc. + +This document uses TensorFlow as the example given that it is the focus of our +immediate work, but we believe that the same viewpoint could be useful for +people working in the context of other ML frameworks that may consider adopting +MLIR in the future. + +### How is MLIR relevant? + +MLIR is an overloaded acronym which unpacks as "Multi-Level Intermediate +Representation". Its high-level purpose is to provide mechanics for describing +and transforming programs and computations in a flexible way. It provides common +compiler infrastructure for things like constant folding, dead code elimination, +graph rewriting, and others - which are independent of the representational +choices picked by a given dialect (e.g. its concurrency semantics). It was built +with a specific focus on compile time and memory efficiency, accurate +propagation of source location information (important for reporting high quality +errors and warnings) and is designed for testability. + +TensorFlow has numerous subsystems (some of which are proprietary, e.g. +Tensor-RT, nGraph, CoreML, etc) as well as translation layers between these +different subsystems, and these translation layers face similar challenges. 
((As +an aside, the internals of each of these subsystems could often benefit from +MLIR infrastructure, but that isn't a focus of this doc.)) + +A key observation that MLIR makes is that these subsystems often have two things +going on: they are both particular data structures and encodings (e.g. HLO +graphs, TF-Lite's flat buffer format, TensorFlow's Graph format, the ONNX +abstraction, etc) as well as an abstraction of computation (a specific way of +modeling a convolution, a set of supported operations etc). + +MLIR uses a standard IR (i.e., a set of data structures) for representing these +computations - this allows a huge amount of shared infrastructure across these +problem domains. MLIR then allows the definition of domain-specific "dialects" +that describe the set of operations that are legal and supported for a given +application. This means that the actual translations between data structures are +kept as simple as possible - and are thus relatively easy to make "correct". +This allows the common compiler infrastructure to handle the mapping problems +and the other issues within the domain. + +MLIR's design is directly informed by the experience of building (and then +living with) intermediate representations like the LLVM IR, LLVM SelectionDAG, +the LLVM machine instruction representation, Swift SIL IR, and learns new +lessons from TensorFlow and XLA HLO, as well as learning from building countless +research and production systems on top of them. Our goal is to drag the state of +the art in compilers forward, not to merely apply a few well-known techniques to +the machine learning domain. + +### What does adoption mean? + +The point of this document is not to advocate for rewriting any particular +subsystem in TensorFlow - indeed, the burden required to justify a rewrite is +high, and often very specific to that subsystem. 
That said, there are several +subsystems that are about to get rewritten or substantially revised anyway, so +we use those as examples to concretely describe the benefits that MLIR provides +in these cases and what it will take. The subsystems discussed are: + +1. the TF Lite TOCO translator, which we need to improve error + reporting/reliability issues and generalize it to support more ops, and +1. the TF/XLA bridge which needs to improve usability by merging some of its + usage models, support dynamic shapes and generalize guest subsystem support + to Tensor-RT and nGraph. +1. Grappler is another subsystem that is likely to get substantial revisions in + the future, and would definitely benefit from the MLIR framework, but there + are no known plans to do that work at this point, so we don't discuss it + further. + +Adopting MLIR for these works the same way - and, in fact, the work to support +TF Lite is mostly a subset of the larger work to support the functionality of +the TF/XLA bridge. TF Lite and the TF/XLA bridge include several compiler passes +(things like encapsulate, functionalize control flow, lowering of ops, fusion, +constant folding, shape inference, etc). + +MLIR supports converting from TensorFlow Graphs to MLIR and back, which means +that we can start by putting in a no-op translation to MLIR and back into the +pipeline, and verify that nothing breaks. Then we can work on replacing the +compiler transformations one by one by reimplementing them (with the improved +algorithms that we're planning). + +This is a development plan, we wouldn't actually ship a TensorFlow that just +uses MLIR for a single pass. In practice, we'll have the MLIR flag gated under +an option, build out a replacement for an entire subsystem (e.g. the TOCO +translator) and when the time is right, we'll do A/B comparisons and eventually +make a switch and phase out the old code over time. + +## What benefit does MLIR provide? 
+ +The adoption plan above might sound like it only makes things worse in the +immediate term - we have two implementations of the same functionality, we are +dividing our efforts, etc. In order for this to be worth it, we should have a +good sense that we are building towards an improved future that will make +customers and TensorFlow engineers happier when it lands. Here we describe a few +of the benefits that MLIR provides, in no particular order: + +### A Lossless Human Editable Textual Representation + +The MLIR in-memory data structure has a human readable and writable format, as +well as [a specification](LangRef.md) for that format - built just like any +other programming language. Important properties of this format are that it is +compact, easy to read, and lossless. You can dump an MLIR program out to disk +and munge around with it, then send it through a few more passes. + +If you haven't worked with a system that works this way, it is hard to overstate +how big of a deal this in practice: it means that you can call `foo->dump()` on +an IR object to see its full contents, it means you can diff the IR before and +after a change, delta reduce IR files, and many other things. + +### A Graph Verification Pass + +Like many other popular compiler infrastructures, MLIR provides infrastructure +and implementation for a "verifier" which checks that the IR is well formed. The +MLIR verifier is a simple framework that makes it easy to provide a single +source of truth for those correctness properties and is general across all +Dialects (e.g. TF Graph, TF Lite flat buffer, XLA HLO, etc). + +A verifier pass is sort of like a 'super assertion' that catches mistakes in +program transformations early, making you as an engineer more productive, making +the product more reliable, and making it easier to track down bugs when they +appear - because the verifier can be run at any time, either as a compiler pass +or with a single function call. 
+ +While MLIR provides a well-considered infrastructure for IR verification, and +has simple checks for existing TensorFlow operations, there is a lot that should +be added here and lots of opportunity to get involved! + +### Designed for Testability + +There are many aspects of this in MLIR, but we'll focus on compiler +transformations since they are the easiest to understand. Compiler +transformations are modeled as subclasses of the `Pass` C++ class, which are +driven by an `mlir-opt` tool. When combined with a lossless textual +representation, it becomes really easy to write unit tests for compiler +transformations, for example, this is a simple test that shows "x-x" is being +turned into zero: + +```mlir + // RUN: mlir-opt %s -canonicalize | FileCheck %s + func @test_subi_zero_cfg(%arg0: i32) -> i32 { + %y = subi %arg0, %arg0 : i32 + return %y: i32 + } + // CHECK-LABEL: func @test_subi_zero_cfg(%arg0: i32) + // CHECK-NEXT: %c0_i32 = constant 0 : i32 + // CHECK-NEXT: return %c0 +``` + +The "CHECK" comments are interpreted by the +[LLVM FileCheck tool](https://llvm.org/docs/CommandGuide/FileCheck.html), which +is sort of like a really advanced grep. This test is fully self-contained: it +feeds the input into the [canonicalize pass](Canonicalization.md), and checks +that the output matches the CHECK lines. See the `test/Transforms` directory for +more examples. In contrast, standard unit testing exposes the API of the +underlying framework to lots and lots of tests (making it harder to refactor and +move the API), typically requires a lot more code, and exacerbates issues with +link time. For examples, see +[the TEST_F functions in TensorFlow's testsuite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc). 
+

MLIR has been pervasively designed with this sort of design by testability,
allowing us to put in place a culture that expects every behavior changing
commit to include a test case, and for these test cases to be stable and
reliable over time, since they are testing exactly what they are supposed to.
End to end integration tests are still super useful for some things of course!

### Infrastructure for Warnings and Error Diagnostics and Location Tracking

MLIR benefits from the lessons learned from building other compilers - including
Clang which
[set the standard](http://blog.llvm.org/2010/04/amazing-feats-of-clang-error-recovery.html)
for quality of implementation in C/C++ compiler diagnostics. Drawing from this
experience (and fixing mistakes in LLVM), MLIR requires that operations and
functions carry abstract location information, that transformations propagate
this information, and provides standardized mechanisms to emit errors and
warnings, as well as for clients to hook into them to capture and report them in
custom ways.

Why is this important? In practice, many graph-to-graph translators can fail
(e.g. TF Lite when an unsupported op is used) and it is important to be able to
report the error up through to the user in the most precise way possible, in
order for it to be actionable. This includes tracking rewrites through fusions
and fissions of ops, mapping back into language / API specific domains, etc.

More selfishly for infrastructure hackers, this is a huge boon because it means
that it is easy to write good tests for this: the testing tools for MLIR capture
the diagnostics produced by passes (using the standard diagnostic hooks) and
check that they match the expected diagnostics in the testcase.
For example, to +test the dependence analysis infra in the code generator, Andy Davis wrote a +simple pass that checks dependencies and emits them as "notes", allowing him to +write tests like this: + +```mlir + // RUN: mlir-opt %s -memref-dependence-check -verify-diagnostics + func @different_memrefs() { + %m.a = alloc() : memref<100xf32> + %m.b = alloc() : memref<100xf32> + %c0 = constant 0 : index + %c1 = constant 1.0 : f32 + store %c1, %m.a[%c0] : memref<100xf32> + // expected-note@-1 {{dependence from memref access 0 to access 1 = false}} + %v0 = load %m.b[%c0] : memref<100xf32> + return + } +``` + +Note that a major limitation of this is that MLIR suffers from a problem of +"garbage in, garbage out": if the input locations to MLIR are imprecise, then +there is nothing that it can do to recover them. There is work underway in +TensorFlow/Python to improve the situation, and Swift for TensorFlow already has +perfect location tracking due to its design. + +### Shape Information Captured in the IR + +In TensorFlow Graphs, each op takes and returns values using a very simple type +system (TF_DataType) in which each value is a tensor of unknown rank and +dimensions. At the same time, many graphs have static shapes easily knowable for +wide swaths of the computation, and even dynamically shaped operations often +have statically knowable dimensions. Many analyses and transformations benefit +and use this information when available, but because TensorFlow graphs don't +capture this (e.g. serialize it to proto), passes have to recompute it on demand +with ShapeRefiner. + +The [MLIR Tensor Type](LangRef.md#tensor-type) directly captures shape +information, so you can have things like: + +```mlir + %x = tf.Add %x, %y : tensor<128 x 8 x ? x f32> +``` + +Capturing this in the IR is expected to speed up transformations (avoiding +recomputing the same info over and over again) which therefore makes it +practical to apply stronger shape analysis algorithms. 
It also makes it easier +to work with the IR, because on-the-side representations can get out of date, +and the API is easier to work with from an ergonomics perspective. + +### Unified Graph Rewriting Infrastructure + +This is still a work in progress, but we have sightlines towards a +[general rewriting infrastructure](GenericDAGRewriter.md) for transforming DAG +tiles into other DAG tiles, using a declarative pattern format. DAG to DAG +rewriting is a generalized solution for many common compiler optimizations, +lowerings, and other rewrites and having an IR enables us to invest in building +a single high-quality implementation. + +Declarative pattern rules are preferable to imperative C++ code for a number of +reasons: they are more compact, easier to reason about, can have checkers +written against them, and new tools can be built that inspect and manipulate the +declarative patterns in interesting ways - e.g. applying theorem provers to +them. It will be exciting to see this ecosystem develop as the infrastructure +matures. + +### Clarified Semantics for TensorFlow Operations + +One of the challenging things about working with TensorFlow is that there are +many invariants and behaviors that need to be preserved and known about when +working with Graphs, and these can be difficult to reason about and lead to +bugs. Things like 'dead values', Switch and Merge nodes, concurrency semantics, +nodes that execute even when passed a dead value, multiple device program +representation - etc... all add complexities that can make it challenging to +reason about whether a transformation or analysis is correct in general. Even +something as simple as constant folding or transforming integer `x-x` into `0` +is non-trivial because you need to consider control dependence edges. + +One of our major goals for the TensorFlow dialect of MLIR is to sort out these +situations and upgrade existing TensorFlow graphs to semantics that are easier +to reason about. 
The solutions to these problems are all still being debated, +but those discussions have already yielded a lot of potential answers: +introducing a `tf_dead_or` types for switch/merge, modeling of TF operations +using futures/async semantics etc. None of these particular battles are critical +or important for MLIR to succeed (because of its "meta" nature, the abstraction +decisions of any given dialect are up for it to decide), but each one that works +out will make it easier to work with and transform TensorFlow operations. We +expect these issues to get nailed down in the next couple of months when MLIR +effort moves beyond TF Lite / TOCO support. The discussions that are happening +now are super valuable and making progress. + +### Ergonomics + +A minor-in-theory, but important-in-practice point is that MLIR is designed to +make it easy, memory efficient, and less error prone to transform code than +other systems. `TensorFlow::Graph` has implementation issues where the same +information is stored redundantly in different places (which must be manually +kept up to date), has somewhat unusual representation of certain constructs +(e.g. the function library, which makes it very difficult to add or remove +functions, e.g. during interprocedural transformations), and stores information +in the graph that is used by the executor, but isn't necessary for program +transformation. + +TensorFlow has made a lot of progress in this area over the years, and there are +lots of ideas about further improvements in the future, we are happy that MLIR +addresses these needs (making it much easier to implement correct program +transformations) today, and are committed to pushing hard to make it better. 
+

### Compile Time Performance and Memory Use

MLIR has been designed to be memory and compile-time efficient in its algorithms
and data structures, using immutable and uniqued structures, low level
bit-packing, and other well-known techniques to avoid unnecessary heap
allocations, and allow simple and safe multithreaded optimization of MLIR
programs. There are other reasons to believe that the MLIR implementations of
common transformations will be more efficient than the Python and C++
TensorFlow::Graph implementations of the same things, given the current
implementation details of TensorFlow.

That said, this is very much a theory at this point. When the new
implementations of various subsystems are available, we will see what happens in
practice: there will be no reason to speculate - we can measure.

## Common Questions and Concerns

Here we address some frequently asked questions and concerns.

### Isn't MLIR a big dependency to take on?

We've heard that at least some people are concerned that MLIR is a "big"
dependency to take on, and could result in large code size. Here are some key
points about MLIR:

1. The entire MLIR codebase is a pretty small C++ code base in absolute terms
   compared to what goes into a modern ML framework.
1. Like LLVM, MLIR is designed as a set of libraries that clients can link in
   or ignore as they wish. For example, the transformations in MLIR are kept
   separate from the core IR abstractions, and dialect specific code (e.g.
   TensorFlow, TF-Lite, XLA, etc) is all independently selectable by the build
   system. Clients that don't care about XLA don't link in that code, whether
   they are a TF-Lite system or a client that is completely unrelated to
   TensorFlow.
1.
MLIR's only third party dependency is on LLVM, but it doesn't depend on LLVM + IR or any other heavy dependency - it just depends on LLVM's support library + which provides efficient hash tables and other + [memory efficient data structures that the STL does not](http://llvm.org/docs/ProgrammersManual.html#picking-the-right-data-structure-for-a-task). + There have been discussions about splitting this set of libraries out to its + own subproject in LLVM that the LLVM IR project depends on. This would be + great for MLIR as well as other LLVM subprojects. +1. TensorFlow and many other frameworks already use LLVM - if so, MLIR would + not be pulling in an additional dependency at all. + +### How does MLIR represent {control flow, concurrency, …} semantics in TensorFlow? + +MLIR provides a dialect that is an isomorphic 1-1 mapping between TensorFlow +graphs and MLIR, as well as a pretty complete translator back and forth (the +only known gap is that a few TF_DataType enums aren't handled yet). MLIR is a +"Multi-Level IR", which allows it to represent code with different abstraction +levels, so the ability to faithfully represent TensorFlow code in a completely +backwards compatible way (even if there are some historical warts!) is critical. + +In *addition* to the isomorphic mapping, we are actively working on efforts to +raise the abstraction level for working with TensorFlow graphs in MLIR. Doing so +would make it even easier to write TensorFlow transformations than it is today, +and would provide a path to migrating TF 1.x graphs forward into the TF 2.x +world. For example, because MLIR has an extensible type system, we can directly +model whether it is impossible for a Tensor value to be a "dead" value - similar +to the use of optional types in modern programming languages. 
+ +These discussions occasionally cause confusion because there are several issues +being mixed up into one: + +* What are the current semantics of TensorFlow graphs, and what invariants can + we rely on? +* What should the semantics be in TensorFlow 2.0? +* What do programs rely on in practice, and if it is unfriendly, can we + migrate it? +* Can we find a way to make it so transforms don't have to worry about the + complexities of Switch/Merge, by using higher level control flow + representations? (tentative answer: yes) +* How should MLIR represent async vs sync operations, what invariants are + provided, how does this dovetail with control flow? +* When is it safe and beneficial to perform optimizations that might reduce + parallelism? + +All of these questions have a "conservative/safe fallback": we can continue +providing exactly the same abstractions that TensorFlow always has. That said, +we are trying hard to level-up the representation (taking advantage of the +"Multi-Level" part of MLIR) because doing so will make it much much easier to +write analyses and transformations than it currently is in TensorFlow. + +### Non Goals + +It is important to point out things that MLIR does not aim to do. For example, +there is no runtime component to MLIR: the TensorFlow executor, the TF Lite +FlatBuffer interpreter, or other existing runtime should be used as-is. + +Another non-goal is that MLIR currently doesn't support a stable binary +encoding. We will certainly add this at some point, but existing formats should +be used for serialization and distribution in the meantime. 
diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md new file mode 100644 index 0000000000000000000000000000000000000000..ff3a21fa1bb6ec65cf444b74ddcbf57ea4b21f72 --- /dev/null +++ b/mlir/docs/OpDefinitions.md @@ -0,0 +1,1210 @@ +# Table-driven Operation Definition Specification (ODS) + +In addition to specializing the `mlir::Op` C++ template, MLIR also supports +defining operations in a table-driven manner. This is achieved via +[TableGen][TableGen], which is both a generic language and its tooling to +maintain records of domain-specific information. Facts regarding an operation +are specified concisely into a TableGen record, which will be expanded into an +equivalent `mlir::Op` C++ template specialization at compiler build time. + +This manual explains in detail all the available mechanisms for defining +operations in such a table-driven manner. It aims to be a specification instead +of a tutorial. Please refer to [Quickstart tutorial to adding MLIR graph +rewrite](QuickstartRewrites.md) for the latter. + +In addition to detailing each mechanism, this manual also tries to capture +best practices. They are rendered as quoted bullet points. + +## Motivation + +MLIR allows pluggable dialects, and dialects contain, among others, a list of +operations. This open and extensible ecosystem leads to the "stringly" type IR +problem, e.g., repetitive string comparisons during optimization and analysis +passes, unintuitive accessor methods (e.g., generic/error prone `getOperand(3)` +vs self-documenting `getStride()`) with more generic return types, verbose and +generic constructors without default arguments, verbose textual IR dump, and +so on. Furthermore, operation verification is: + +1. best case: a central string-to-verification-function map, +1. middle case: duplication of verification across the code base, or +1. worst case: no verification functions. + +The fix is to support defining ops in a table-driven manner. 
Then for each +dialect, we can have a central place that contains everything you need to know +about each op, including its constraints, custom assembly form, etc. This +description is also used to generate helper functions and classes to allow +building, verification, parsing, printing, analysis, and many more. + +## Benefits + +Compared to the C++ template, this table-driven approach has several benefits +including but not limited to: + +* **Single source of truth**: We strive to encode all facts regarding an + operation into the record, so that readers don't need to jump among code + snippets to fully understand an operation. +* **Removing boilerplate**: We can automatically generate + operand/attribute/result getter methods, operation build methods, operation + verify methods, and many more utilities from the record. This greatly reduces + the boilerplate needed for defining a new op. +* **Facilitating auto-generation**: The usage of these operation information + records are by no means limited to op definition itself. We can use them to + drive the auto-generation of many other components, like computation graph + serialization. + +## TableGen Syntax + +We use TableGen as the language for specifying operation information. TableGen +itself just provides syntax for writing records; the syntax and constructs +allowed in a TableGen file (typically with filename suffix `.td`) can be found +[here][TableGenIntro]. The formal language specification can be found +[here][TableGenRef]. _Roughly_ speaking, + +* TableGen `class` is similar to C++ class; it can be templated and + subclassed. +* TableGen `def` is similar to C++ object; it can be declared by specializing + a TableGen `class` (e.g., `def MyDef : MyClass<...>;`) or completely + independently (e.g., `def MyDef;`). It cannot be further templated or + subclassed. +* TableGen `dag` is a dedicated type for directed acyclic graph of elements. A + `dag` has one operator and zero or more arguments. 
Its syntax is `(operator
+  arg0, arg1, argN)`. The operator can be any TableGen `def`; an argument can
+  be anything, including `dag` itself. We can have names attached to both the
+  operator and the arguments like `(MyOp:$op_name MyArg:$arg_name)`.
+
+Please see the [language introduction][TableGenIntro] to learn about all the
+types and expressions supported by TableGen.
+
+## Operation Definition
+
+MLIR defines several common constructs to help operation definition and provide
+their semantics via a special [TableGen backend][TableGenBackend]:
+[`OpDefinitionsGen`][OpDefinitionsGen]. These constructs are defined in
+[`OpBase.td`][OpBase]. The main ones are
+
+* The `Op` class: It is the main construct for defining operations. All facts
+  regarding the operation are specified when specializing this class, with the
+  help of the following constructs.
+* The `Dialect` class: Operations belonging to one logical group are placed in
+  the same dialect. The `Dialect` class contains dialect-level information.
+* The `OpTrait` class hierarchy: They are used to specify special properties
+  and constraints of the operation, including whether the operation has side
+  effects or whether its output has the same shape as the input.
+* The `ins`/`outs` marker: These are two special markers built into the
+  `OpDefinitionsGen` backend. They lead the definitions of operands/attributes
+  and results respectively.
+* The `TypeConstraint` class hierarchy: They are used to specify the
+  constraints over operands or results. A notable subclass hierarchy is
+  `Type`, which stands for constraints for common C++ types.
+* The `AttrConstraint` class hierarchy: They are used to specify the
+  constraints over attributes. A notable subclass hierarchy is `Attr`, which
+  stands for constraints for attributes whose values are of common types.
+
+An operation is defined by specializing the `Op` class with concrete contents
+for all the fields it requires. 
For example, `tf.AvgPool` is defined as + +```tablegen +def TF_AvgPoolOp : TF_Op<"AvgPool", [NoSideEffect]> { + let summary = "Performs average pooling on the input."; + + let description = [{ +Each entry in `output` is the mean of the corresponding size `ksize` +window in `value`. + }]; + + let arguments = (ins + TF_FpTensor:$value, + + Confined]>:$ksize, + Confined]>:$strides, + TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + DefaultValuedAttr:$data_format + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} +``` + +In the following we describe all the fields needed. Please see the definition +of the `Op` class for the complete list of fields supported. + +### Operation name + +The operation name is a unique identifier of the operation within MLIR, e.g., +`tf.Add` for addition operation in the TensorFlow dialect. This is the +equivalent of the mnemonic in assembly language. It is used for parsing and +printing in the textual format. It is also used for pattern matching in graph +rewrites. + +The full operation name is composed of the dialect name and the op name, with +the former provided via the dialect and the latter provided as the second +template parameter to the `Op` class. + +### Operation documentation + +This includes both an one-line `summary` and a longer human-readable +`description`. They will be used to drive automatic generation of dialect +documentation. They need to be provided in the operation's definition body: + +```tablegen +let summary = "..."; + +let description = [{ +... +}]; +``` + +`description` should be written in Markdown syntax. + +Placing the documentation at the beginning is recommended since +it helps in understanding the operation. + +> * Place documentation at the beginning of the operation definition +> * The summary should be short and concise. It should be a one-liner without +> trailing punctuation. Put expanded explanation in description. 
+
+### Operation arguments
+
+There are two kinds of arguments: operands and attributes. Operands are runtime
+values produced by other ops, while attributes are compile-time known constant
+values, including two categories:
+
+1. Natural attributes: these attributes affect the behavior of the operations
+   (e.g., padding for convolution);
+1. Derived attributes: these attributes are not needed to define the operation
+   but are instead derived from information of the operation. E.g., the output
+   shape of type. This is mostly used for convenience interface generation or
+   interaction with other frameworks/translation.
+
+Both operands and attributes are specified inside the `dag`-typed `arguments`,
+led by `ins`:
+
+```tablegen
+let arguments = (ins
+  <type-constraint>:$<operand-name>,
+  ...
+  <attr-constraint>:$<attr-name>,
+  ...
+);
+```
+
+Here `<type-constraint>` is a TableGen `def` from the `TypeConstraint` class
+hierarchy. Similarly, `<attr-constraint>` is a TableGen `def` from the
+`AttrConstraint` class hierarchy. See [Constraints](#constraints) for more
+information.
+
+There are no requirements on the relative order of operands and attributes; they
+can mix freely. The relative order of operands themselves matters. From each
+named argument a named getter will be generated that returns the argument with
+the return type (in the case of attributes the return type will be
+constructed from the storage type, while for operands it will be `Value`). Each
+attribute's raw value (e.g., as stored) can also be accessed via generated
+`<name>Attr` getters for use in transformation passes where the more user
+friendly return type is less suitable.
+
+All the arguments should be named to 1) provide documentation, 2) drive
+auto-generation of getter methods, 3) provide a handle to reference for other
+places like constraints.
+
+#### Variadic operands
+
+To declare a variadic operand, wrap the `TypeConstraint` for the operand with
+`Variadic<...>`.
+
+Normally operations have no variadic operands or just one variadic operand. 
For
+the latter case, it is easy to deduce which dynamic operands are for the static
+variadic operand definition. But if an operation has more than one variadic
+operand, it would be impossible to attribute dynamic operands to the
+corresponding static variadic operand definitions without further information
+from the operation. Therefore, the `SameVariadicOperandSize` trait is needed to
+indicate that all variadic operands have the same number of dynamic values.
+
+#### Optional attributes
+
+To declare an optional attribute, wrap the `AttrConstraint` for the attribute
+with `OptionalAttr<...>`.
+
+#### Attributes with default values
+
+To declare an attribute with a default value, wrap the `AttrConstraint` for the
+attribute with `DefaultValuedAttr<..., "...">`.
+
+The second parameter to `DefaultValuedAttr` should be a string containing the
+C++ default value. For example, a float default value should be specified
+like `"0.5f"`, and an integer array default value should be specified like
+`"{1, 2, 3}"`.
+
+#### Confining attributes
+
+`Confined` is provided as a general mechanism to help modelling further
+constraints on attributes beyond the ones brought by value types. You can use
+`Confined` to compose complex constraints out of more primitive ones. For
+example, a 32-bit integer attribute whose minimum value must be 10 can be
+expressed as `Confined<I32Attr, [IntMinValue<10>]>`. 
+ +Right now, the following primitive constraints are supported: + +* `IntMinValue`: Specifying an integer attribute to be greater than or + equal to `N` +* `IntMaxValue`: Specifying an integer attribute to be less than or equal + to `N` +* `ArrayMinCount`: Specifying an array attribute to have at least `N` + elements +* `IntArrayNthElemEq`: Specifying an integer array attribute's `I`-th + element to be equal to `N` +* `IntArrayNthElemMinValue`: Specifying an integer array attribute's + `I`-th element to be greater than or equal to `N` + +TODO: Design and implement more primitive constraints + +### Operation results + +Similar to operands, results are specified inside the `dag`-typed `results`, led +by `outs`: + +```tablegen +let results = (outs + :$, + ... +); +``` + +#### Variadic results + +Similar to variadic operands, `Variadic<...>` can also be used for results. +And similarly, `SameVariadicResultSize` for multiple variadic results in the +same operation. + +### Operation traits and constraints + +Traits are operation properties that affect syntax or semantics. MLIR C++ +models various traits in the `mlir::OpTrait` namespace. + +Both operation traits, [interfaces](#operation-interfaces), and constraints +involving multiple operands/attributes/results are provided as the second +template parameter to the `Op` class. They should be deriving from the `OpTrait` +class. See [Constraints](#constraints) for more information. + +### Operation interfaces + +[Operation interfaces](Interfaces.md#operation-interfaces) are a mechanism by +which to opaquely call methods and access information on an *Op instance*, +without knowing the exact operation type. Operation interfaces defined in C++ +can be accessed in the ODS framework via the `OpInterfaceTrait` class. Aside +from using pre-existing interfaces in the C++ API, the ODS framework also +provides a simplified mechanism for defining such interfaces; that removes much +of the boilerplate necessary. 
+ +Providing a definition of the `OpInterface` class will auto-generate the C++ +classes for the interface. An `OpInterface` includes a name, for the C++ class, +a description, and a list of interface methods. + +```tablegen +def MyInterface : OpInterface<"MyInterface"> { + let description = ...; + let methods = [...]; +} +``` + +There are two types of methods that can be used with an interface, +`InterfaceMethod` and `StaticInterfaceMethod`. They are both comprised of the +same core components, with the distinction that `StaticInterfaceMethod` models a +static method on the derived operation. + +An `InterfaceMethod` is comprised of the following components: + +* Description + - A string description of what this method does and its invariants. +* ReturnType + - A string corresponding to the C++ return type of the method. +* MethodName + - A string corresponding to the desired name of the method. +* Arguments (Optional) + - A dag of strings that correspond to a C++ type and variable name + respectively. +* MethodBody (Optional) + - An optional explicit implementation of the interface method. + - `ConcreteOp` is an implicitly defined typename that can be used to refer + to the type of the derived operation currently being operated on. + - In non-static methods, a variable 'ConcreteOp op' is defined and may be + used to refer to an instance of the derived operation. +* DefaultImplementation (Optional) + - An optional explicit default implementation of the interface method. + - This method is placed within the `Trait` class that is attached to the + operation. As such, this method has the same characteristics as any + other [`Trait`](Traits.md) method. + - `ConcreteOp` is an implicitly defined typename that can be used to refer + to the type of the derived operation currently being operated on. + +ODS also allows generating the declarations for the `InterfaceMethod` of the op +if one specifies the interface with `DeclareOpInterfaceMethods` (see example +below). 
+ +Examples: + +```tablegen +def MyInterface : OpInterface<"MyInterface"> { + let description = [{ + My interface is very interesting. ... + }]; + + let methods = [ + // A simple non-static method with no inputs. + InterfaceMethod<"'foo' is a non-static method with no inputs.", + "unsigned", "foo" + >, + + // A new non-static method accepting an input argument. + InterfaceMethod<"/*insert doc here*/", + "Value ", "bar", (ins "unsigned":$i) + >, + + // Query a static property of the derived operation. + StaticInterfaceMethod<"'fooStatic' is a static method with no inputs.", + "unsigned", "fooStatic" + >, + + // Provide the definition of a static interface method. + // Note: `ConcreteOp` corresponds to the derived operation typename. + StaticInterfaceMethod<"/*insert doc here*/", + "Operation *", "create", (ins "OpBuilder &":$builder, "Location":$loc), [{ + return builder.create(loc); + }]>, + + // Provide a definition of the non-static method. + // Note: `op` corresponds to the derived operation variable. + InterfaceMethod<"/*insert doc here*/", + "unsigned", "getNumInputsAndOutputs", (ins), [{ + return op.getNumInputs() + op.getNumOutputs(); + }]>, + + // Provide only a default definition of the method. + // Note: `ConcreteOp` corresponds to the derived operation typename. + InterfaceMethod<"/*insert doc here*/", + "unsigned", "getNumInputsAndOutputs", (ins), /*methodBody=*/[{}], [{ + ConcreteOp op = cast(getOperation()); + return op.getNumInputs() + op.getNumOutputs(); + }]>, + ]; +} + +// Interfaces can optionally be wrapped inside DeclareOpInterfaceMethods. This +// would result in autogenerating declarations for members `foo`, `bar` and +// `fooStatic`. Methods with bodies are not declared inside the op +// declaration but instead handled by the op interface trait directly. +def OpWithInferTypeInterfaceOp : Op<... + [DeclareOpInterfaceMethods]> { ... 
} +``` + +### Builder methods + +For each operation, there are a few builders automatically generated based on +the arguments and returns types. For example, given the following op definition: + +```tablegen +def MyOp : ... { + let arguments = (ins + I32:$i32_operand, + F32:$f32_operand, + ..., + + I32Attr:$i32_attr, + F32Attr:$f32_attr, + ... + ); + + let results = (outs + I32:$i32_result, + F32:$f32_result, + ... + ); +} +``` + +The following builders are generated: + +```c++ +// All result-types/operands/attributes have one aggregate parameter. +static void build(Builder *tblgen_builder, OperationState &tblgen_state, + ArrayRef resultTypes, + ValueRange operands, + ArrayRef attributes); + +// Each result-type/operand/attribute has a separate parameter. The parameters +// for attributes are of mlir::Attribute types. +static void build(Builder *tblgen_builder, OperationState &tblgen_state, + Type i32_result, Type f32_result, ..., + Value i32_operand, Value f32_operand, ..., + IntegerAttr i32_attr, FloatAttr f32_attr, ...); + +// Each result-type/operand/attribute has a separate parameter. The parameters +// for attributes are raw values unwrapped with mlir::Attribute instances. +// (Note that this builder will not always be generated. See the following +// explanation for more details.) +static void build(Builder *tblgen_builder, OperationState &tblgen_state, + Type i32_result, Type f32_result, ..., + Value i32_operand, Value f32_operand, ..., + APInt i32_attr, StringRef f32_attr, ...); + +// Each operand/attribute has a separate parameter but result type is aggregate. +static void build(Builder *tblgen_builder, OperationState &tblgen_state, + ArrayRef resultTypes, + Value i32_operand, Value f32_operand, ..., + IntegerAttr i32_attr, FloatAttr f32_attr, ...); + +// All operands/attributes have aggregate parameters. +// Generated if InferTypeOpInterface interface is specified. 
+static void build(Builder *tblgen_builder, OperationState &tblgen_state, + ValueRange operands, + ArrayRef attributes); + +// (And manually specified builders depending on the specific op.) +``` + +The first form provides basic uniformity so that we can create ops using the +same form regardless of the exact op. This is particularly useful for +implementing declarative pattern rewrites. + +The second and third forms are good for use in manually written code given that +they provide better guarantee via signatures. + +The third form will be generated if any of the op's attribute has different +`Attr.returnType` from `Attr.storageType` and we know how to build an attribute +from an unwrapped value (i.e., `Attr.constBuilderCall` is defined.) +Additionally, for the third form, if an attribute appearing later in the +`arguments` list has a default value, the default value will be supplied in the +declaration. This works for `BoolAttr`, `StrAttr`, `EnumAttr` for now and the +list can grow in the future. So if possible, default valued attribute should be +placed at the end of the `arguments` list to leverage this feature. (This +behavior is essentially due to C++ function parameter default value placement +restrictions.) Otherwise, the builder of the third form will still be generated +but default values for the attributes not at the end of the `arguments` list +will not be supplied in the builder's signature. + +And there may potentially exist other builders depending on the specific op; +please refer to the +[generated C++ file](#run-mlir-tblgen-to-see-the-generated-content) for the +complete list. + +#### Custom builder methods + +However, if the above cases cannot satisfy all needs, you can define additional +convenience build methods with `OpBuilder`. + +`OpBuilder` is a class that takes the parameter list and the optional `build()` +method body. They are separated because we need to generate op declaration and +definition into separate files. 
The parameter list should _include_ `Builder +*builder, OperationState &state`. If the `body` is not provided, _only_ the +builder declaration will be generated; this provides a way to define complicated +builders entirely in C++ files. + +For example, for the following op: + +```tablegen +def MyOp : Op<"my_op", []> { + let arguments = (ins F32Attr:$attr); + + let results = (outs); +} +``` + +If we want to define a builder with a default value for the only attribute, we +can add into `MyOp`: + +```tablegen +def MyOp : ... { + ... + + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, float val = 0.5f", [{ + state.addAttribute("attr", builder->getF32FloatAttr(val)); + }]> + ]; +} +``` + +The generated builder will look like: + +```c++ +static void build(Builder *builder, OperationState &state, float val = 0.5f) { + state.addAttribute("attr", builder->getF32FloatAttr(val)); +} +``` + +### Custom parser and printer methods + +Functions to parse and print the operation's custom assembly form. + +### Custom verifier code + +Verification code will be automatically generated for +[constraints](#constraints) specified on various entities of the op. To +perform _additional_ verification, you can use + +```tablegen +let verifier = [{ + ... +}]; +``` + +Code placed in `verifier` will be called after the auto-generated verification +code. + +### `hasCanonicalizer` + +This boolean field indicate whether canonicalization patterns have been defined +for this operation. If it is `1`, then `::getCanonicalizationPatterns()` should +be defined. + +### `hasFolder` + +This boolean field indicate whether general folding rules have been defined +for this operation. If it is `1`, then `::fold()` should be defined. + +### Extra declarations + +One of the goals of table-driven op definition is to auto-generate as much logic +and methods needed for each op as possible. With that said, there will always be +long-tail cases that won't be covered. 
For such cases, you can use +`extraClassDeclaration`. Code in `extraClassDeclaration` will be copied +literally to the generated C++ op class. + +Note that `extraClassDeclaration` is a mechanism intended for long-tail cases +by power users; for not-yet-implemented widely-applicable cases, improving the +infrastructure is preferable. + +### Generated C++ code + +[OpDefinitionsGen][OpDefinitionsGen] processes the op definition spec file and +generates two files containing the corresponding C++ code: one for declarations, +the other for definitions. The former is generated via the `-gen-op-decls` +command-line option, while the latter is via the `-gen-op-defs` option. + +The definition file contains all the op method definitions, which can be +included and enabled by defining `GET_OP_CLASSES`. For each operation, +OpDefinitionsGen generates an operation class and an +[operand adaptor](#operand-adaptors) class. Besides, it also contains a +comma-separated list of all defined ops, which can be included and enabled by +defining `GET_OP_LIST`. + +#### Class name and namespaces + +For each operation, its generated C++ class name is the symbol `def`ed with +TableGen with dialect prefix removed. The first `_` serves as the delimiter. +For example, for `def TF_AddOp`, the C++ class name would be `AddOp`. +We remove the `TF` prefix because it is for scoping ops; other dialects +may as well define their own `AddOp`s. + +The namespaces of the generated C++ class will come from the dialect's +`cppNamespace` field. For example, if a dialect's `cppNamespace` is `A::B`, +then an op of that dialect will be placed in +`namespace A { namespace B { ... } }`. If a dialect does not specify a +`cppNamespace`, we then use the dialect's name as the namespace. + +This means the qualified name of the generated C++ class does not necessarily +match exactly with the operation name as explained in +[Operation name](#operation-name). 
This is to allow flexible naming to satisfy
+coding style requirements.
+
+#### Operand adaptors
+
+For each operation, we automatically generate an _operand adaptor_. This class
+solves the problem of accessing operands provided as a list of `Value`s without
+using "magic" constants. The operand adaptor takes a reference to an array of
+`Value` and provides methods with the same names as those in the operation class
+to access them. For example, for a binary arithmetic operation, it may provide
+`.lhs()` to access the first operand and `.rhs()` to access the second operand.
+
+The operand adaptor class lives in the same namespace as the operation class,
+and has the name of the operation followed by `OperandAdaptor`. A template
+declaration `OperandAdaptor<>` is provided to look up the operand adaptor for
+the given operation.
+
+Operand adaptors can be used in function templates that also process operations:
+
+```c++
+template <typename BinaryOpTy>
+std::pair<Value, Value> zip(BinaryOpTy &&op) {
+  return std::make_pair(op.lhs(), op.rhs());
+}
+
+void process(AddOp op, ArrayRef<Value> newOperands) {
+  zip(op);
+  zip(OperandAdaptor<AddOp>(newOperands));
+  /*...*/
+}
+```
+
+## Constraints
+
+Constraint is a core concept in table-driven operation definition: operation
+verification and graph operation matching are all based on satisfying
+constraints. So both the operation definition and rewrite rules specification
+significantly involve writing constraints. We have the `Constraint` class in
+[`OpBase.td`][OpBase] as the common base class for all constraints.
+
+An operation's constraint can cover different ranges; it may
+
+* Only concern a single attribute (e.g. being a 32-bit integer greater than 5),
+* Multiple operands and results (e.g., the 1st result's shape must be the same
+  as the 1st operand), or
+* Intrinsic to the operation itself (e.g., having no side effect).
+
+We call them single-entity constraint, multi-entity constraint, and traits,
+respectively. 
+ +### Single-entity constraint + +Constraints scoped to a single operand, attribute, or result are specified at +the entity's declaration place as described in +[Operation arguments](#operation-arguments) and +[Operation results](#operation-results). + +To help modelling constraints of common types, a set of `TypeConstraint`s are +created; they are the `Type` subclass hierarchy. It includes `F32` for the +constraints of being a float, `TensorOf<[F32]>` for the constraints of being +a float tensor, and so on. + +Similarly, a set of `AttrConstraint`s are created for helping modelling +constraints of common attribute kinds. They are the `Attr` subclass hierarchy. +It includes `F32Attr` for the constraints of being a float attribute, +`F32ArrayAttr` for the constraints of being a float array attribute, and so on. + +### Multi-entity constraint + +Constraints involving more than one operand/attribute/result are quite common +on operations, like the element type and shape relation between operands and +results. These constraints should be specified as the `Op` class template +parameter as described in +[Operation traits and constraints](#operation-traits-and-constraints). + +Multi-entity constraints are modeled as `PredOpTrait` (a subclass of `OpTrait`) +in [`OpBase.td`][OpBase].A bunch of constraint primitives are provided to help +specification. See [`OpBase.td`][OpBase] for the complete list. + +### Trait + +Traits are intrinsic properties of the operation like having side effect or not, +commutative or not, whether is a terminator, etc. These constraints should be +specified as the `Op` class template parameter as described in +[Operation traits and constraints](#operation-traits-and-constraints). + +Traits are modeled as `NativeOpTrait` (a subclass of `OpTrait`) in +[`OpBase.td`][OpBase]. They are backed and will be translated into the +corresponding C++ `mlir::OpTrait` classes. 
+ +### How to specify new constraint + +To write a constraint, you need to provide its predicates and give it a +descriptive name. Predicates, modeled with the `Pred` class, are the workhorse +for composing constraints. The predicate for a constraint is typically built up +in a nested manner, using the two categories of predicates: + +1. `CPred`: the primitive leaf predicate. +2. Compound predicate: a predicate composed from child predicates using + predicate combiners (conjunction: `And`, disjunction: `Or`, negation: `Neg`, + substitution: `SubstLeaves`, concatenation: `Concat`). + +`CPred` is the basis for composing more complex predicates. It is the "atom" +predicate from the perspective of TableGen and the "interface" between +TableGen and C++. What is inside is already C++ code, which will be treated +as opaque strings with special placeholders to be substituted. + +You can put any C++ code that returns a boolean value inside a `CPred`, +including evaluating expressions, calling functions, calling class methods, +and so on. + +To help interaction with the C++ environment, there are a few special +placeholders provided to refer to entities in the context where this predicate +is used. They serve as "hooks" to the enclosing environment. This includes +`$_builder`, `$_op`, and `$_self`: + +* `$_builder` will be replaced by a `mlir::Builder` instance so that you can + access common build methods. +* `$_op` will be replaced by the current operation so that you can access + information of the current operation. +* `$_self` will be replaced with the entity this predicate is attached to. + E.g., `BoolAttr` is an attribute constraint that wraps a + `CPred<"$_self.isa()">`. Then for `F32:$attr`,`$_self` will be + replaced by `$attr`. 
For type constraints, it's a little bit special since
+ we want the constraints on each type definition to read naturally and we want
+ to attach type constraints directly to an operand/result, so `$_self` will be
+ replaced by the operand/result's type. E.g., for `F32` in `F32:$operand`, its
+ `$_self` will be expanded as `getOperand(...)->getType()`.
+
+TODO(b/130663252): Reconsider the leading symbol for special placeholders.
+Eventually we want to allow referencing operand/result $-names; such $-names
+can start with underscore.
+
+For example, to write that an attribute `attr` is an `IntegerAttr`, in C++ you
+can just call `attr.isa<IntegerAttr>()`. The code can be wrapped in a `CPred`
+as `$_self.isa<IntegerAttr>()`, with `$_self` as the special placeholder to be
+replaced by the current attribute `attr` at expansion time.
+
+For more complicated predicates, you can wrap them in a single `CPred`, or you
+can use predicate combiners to combine them. For example, to write the
+constraint that an attribute `attr` is a 32-bit or 64-bit integer, you can
+write it as
+
+```tablegen
+And<[
+  CPred<"$_self.isa<IntegerAttr>()">,
+  Or<[
+    CPred<"$_self.cast<IntegerAttr>().getType().isInteger(32)">,
+    CPred<"$_self.cast<IntegerAttr>().getType().isInteger(64)">
+  ]>
+]>
+```
+
+(Note that the above is just to show with a familiar example how you can use
+`CPred` and predicate combiners to write complicated predicates. For integer
+attributes specifically, [`OpBase.td`][OpBase] already defines `I32Attr` and
+`I64Attr`. So you can actually reuse them to write it as `Or<[I32Attr.predicate,
+I64Attr.predicate]>`.)
+
+TODO: Build up a library of reusable primitive constraints
+
+If the predicate is very complex to write with `CPred` together with predicate
+combiners, you can also write it as a normal C++ function and use the `CPred`
+as a way to "invoke" the function. For example, to verify an attribute `attr`
+has some property, you can write a C++ function like
+
+```cpp
+bool HasSomeProperty(Attribute attr) { ... 
} +}
+```
+
+and then define the op as:
+
+```tablegen
+def HasSomeProperty : AttrConstraint<CPred<"HasSomeProperty($_self)">,
+                                     "has some property">;
+
+def MyOp : Op<...> {
+  let arguments = (ins
+    ...
+    HasSomeProperty:$attr
+  );
+}
+```
+
+As to whether we should define the predicate using a single `CPred` wrapping
+the whole expression, multiple `CPred`s with predicate combiners, or a single
+`CPred` "invoking" a function, there are no clear-cut criteria. Defining using
+`CPred` and predicate combiners is preferable since it exposes more information
+(instead of hiding all the logic behind a C++ function) into the op definition
+spec so that it can potentially drive more auto-generation cases. But it will
+require a nice library of common predicates as the building blocks to avoid the
+duplication, which is being worked on right now.
+
+## Attribute Definition
+
+### Enum attributes
+
+Some attributes can only take values from a predefined enum, e.g., the
+comparison kind of a comparison op. To define such attributes, ODS provides
+several mechanisms: `StrEnumAttr`, `IntEnumAttr`, and `BitEnumAttr`.
+
+* `StrEnumAttr`: each enum case is a string, the attribute is stored as a
+  [`StringAttr`][StringAttr] in the op.
+* `IntEnumAttr`: each enum case is an integer, the attribute is stored as an
+  [`IntegerAttr`][IntegerAttr] in the op.
+* `BitEnumAttr`: each enum case is a bit, the attribute is stored as an
+  [`IntegerAttr`][IntegerAttr] in the op.
+
+All these `*EnumAttr` attributes require fully specifying all of the allowed
+cases via their corresponding `*EnumAttrCase`. With this, ODS is able to
+generate additional verification to only accept allowed cases. To facilitate the
+interaction between `*EnumAttr`s and their C++ consumers, the
+[`EnumsGen`][EnumsGen] TableGen backend can generate a few common utilities: a
+C++ enum class, `llvm::DenseMapInfo` for the enum class, conversion functions
+from/to strings. 
This is controlled via the `-gen-enum-decls` and +`-gen-enum-defs` command-line options of `mlir-tblgen`. + +For example, given the following `EnumAttr`: + +```tablegen +def Case15: I32EnumAttrCase<"Case15", 15>; +def Case20: I32EnumAttrCase<"Case20", 20>; + +def MyIntEnum: I32EnumAttr<"MyIntEnum", "An example int enum", + [Case15, Case20]> { + let cppNamespace = "Outer::Inner"; + let stringToSymbolFnName = "ConvertToEnum"; + let symbolToStringFnName = "ConvertToString"; +} +``` + +The following will be generated via `mlir-tblgen -gen-enum-decls`: + +```c++ +namespace Outer { +namespace Inner { +// An example int enum +enum class MyIntEnum : uint32_t { + Case15 = 15, + Case20 = 20, +}; + +llvm::Optional symbolizeMyIntEnum(uint32_t); +llvm::StringRef ConvertToString(MyIntEnum); +llvm::Optional ConvertToEnum(llvm::StringRef); +inline constexpr unsigned getMaxEnumValForMyIntEnum() { + return 20; +} + +} // namespace Inner +} // namespace Outer + +namespace llvm { +template<> struct DenseMapInfo { + using StorageInfo = llvm::DenseMapInfo; + + static inline Outer::Inner::MyIntEnum getEmptyKey() { + return static_cast(StorageInfo::getEmptyKey()); + } + + static inline Outer::Inner::MyIntEnum getTombstoneKey() { + return static_cast(StorageInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const Outer::Inner::MyIntEnum &val) { + return StorageInfo::getHashValue(static_cast(val)); + } + + static bool isEqual(const Outer::Inner::MyIntEnum &lhs, const Outer::Inner::MyIntEnum &rhs) { + return lhs == rhs; + } +}; +} +``` + +The following will be generated via `mlir-tblgen -gen-enum-defs`: + +```c++ +namespace Outer { +namespace Inner { +llvm::StringRef ConvertToString(MyIntEnum val) { + switch (val) { + case MyIntEnum::Case15: return "Case15"; + case MyIntEnum::Case20: return "Case20"; + } + return ""; +} + +llvm::Optional ConvertToEnum(llvm::StringRef str) { + return llvm::StringSwitch>(str) + .Case("Case15", MyIntEnum::Case15) + .Case("Case20", MyIntEnum::Case20) 
+ .Default(llvm::None); +} +llvm::Optional symbolizeMyIntEnum(uint32_t value) { + switch (value) { + case 15: return MyIntEnum::Case15; + case 20: return MyIntEnum::Case20; + default: return llvm::None; + } +} + +} // namespace Inner +} // namespace Outer +``` + +Similarly for the following `BitEnumAttr` definition: + +```tablegen +def None: BitEnumAttrCase<"None", 0x0000>; +def Bit1: BitEnumAttrCase<"Bit1", 0x0001>; +def Bit2: BitEnumAttrCase<"Bit2", 0x0002>; +def Bit3: BitEnumAttrCase<"Bit3", 0x0004>; + +def MyBitEnum: BitEnumAttr<"MyBitEnum", "An example bit enum", + [None, Bit1, Bit2, Bit3]>; +``` + +We can have: + +```c++ +// An example bit enum +enum class MyBitEnum : uint32_t { + None = 0, + Bit1 = 1, + Bit2 = 2, + Bit3 = 4, +}; + +llvm::Optional symbolizeMyBitEnum(uint32_t); +std::string stringifyMyBitEnum(MyBitEnum); +llvm::Optional symbolizeMyBitEnum(llvm::StringRef); +inline MyBitEnum operator|(MyBitEnum lhs, MyBitEnum rhs) { + return static_cast(static_cast(lhs) | static_cast(rhs)); +} +inline MyBitEnum operator&(MyBitEnum lhs, MyBitEnum rhs) { + return static_cast(static_cast(lhs) & static_cast(rhs)); +} +inline bool bitEnumContains(MyBitEnum bits, MyBitEnum bit) { + return (static_cast(bits) & static_cast(bit)) != 0; +} + +namespace llvm { +template<> struct DenseMapInfo<::MyBitEnum> { + using StorageInfo = llvm::DenseMapInfo; + + static inline ::MyBitEnum getEmptyKey() { + return static_cast<::MyBitEnum>(StorageInfo::getEmptyKey()); + } + + static inline ::MyBitEnum getTombstoneKey() { + return static_cast<::MyBitEnum>(StorageInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const ::MyBitEnum &val) { + return StorageInfo::getHashValue(static_cast(val)); + } + + static bool isEqual(const ::MyBitEnum &lhs, const ::MyBitEnum &rhs) { + return lhs == rhs; + } +}; +``` + +```c++ +std::string stringifyMyBitEnum(MyBitEnum symbol) { + auto val = static_cast(symbol); + // Special case for all bits unset. 
+ if (val == 0) return "None"; + + llvm::SmallVector strs; + if (1u & val) { strs.push_back("Bit1"); val &= ~1u; } + if (2u & val) { strs.push_back("Bit2"); val &= ~2u; } + if (4u & val) { strs.push_back("Bit3"); val &= ~4u; } + + if (val) return ""; + return llvm::join(strs, "|"); +} + +llvm::Optional symbolizeMyBitEnum(llvm::StringRef str) { + // Special case for all bits unset. + if (str == "None") return MyBitEnum::None; + + llvm::SmallVector symbols; + str.split(symbols, "|"); + + uint32_t val = 0; + for (auto symbol : symbols) { + auto bit = llvm::StringSwitch>(symbol) + .Case("Bit1", 1) + .Case("Bit2", 2) + .Case("Bit3", 4) + .Default(llvm::None); + if (bit) { val |= *bit; } else { return llvm::None; } + } + return static_cast(val); +} + +llvm::Optional symbolizeMyBitEnum(uint32_t value) { + // Special case for all bits unset. + if (value == 0) return MyBitEnum::None; + + if (value & ~(1u | 2u | 4u)) return llvm::None; + return static_cast(value); +} +``` + +TODO(b/132506080): This following is outdated. Update it. + +An attribute is a compile time known constant of an operation. Attributes are +required to be known to construct an operation (e.g., the padding behavior is +required to fully define the `conv2d` op). + +Attributes are defined as having a storage type (corresponding to a derived +class of `mlir::Attribute`), a return type (that corresponds to the C++ type to +use in the generation of the helper accessors) as well as method to convert +between the internal storage and the helper method. Derived attributes are a +special class of attributes that do not have storage but are instead calculated +based on the operation and its attributes. + +## Debugging Tips + +### Run `mlir-tblgen` to see the generated content + +TableGen syntax sometimes can be obscure; reading the generated content can be +a very helpful way to understand and debug issues. To build `mlir-tblgen`, run +`cmake --build . 
--target mlir-tblgen` in your build directory and find the
+`mlir-tblgen` binary in the `bin/` subdirectory. All the supported generators
+can be found via `mlir-tblgen --help`. For example, `--gen-op-decls` and
+`--gen-op-defs` as explained in [Generated C++ code](#generated-c++-code).
+
+To see the generated code, invoke `mlir-tblgen` with a specific generator by
+providing include paths via `-I`. For example,
+
+```sh
+# To see op C++ class declaration
+mlir-tblgen --gen-op-decls -I /path/to/mlir/include /path/to/input/td/file
+# To see op C++ class definition
+mlir-tblgen --gen-op-defs -I /path/to/mlir/include /path/to/input/td/file
+# To see op documentation
+mlir-tblgen --gen-op-doc -I /path/to/mlir/include /path/to/input/td/file
+
+# To see op interface C++ class declaration
+mlir-tblgen --gen-op-interface-decls -I /path/to/mlir/include /path/to/input/td/file
+# To see op interface C++ class definition
+mlir-tblgen --gen-op-interface-defs -I /path/to/mlir/include /path/to/input/td/file
+# To see op interface documentation
+mlir-tblgen --gen-op-interface-doc -I /path/to/mlir/include /path/to/input/td/file
+```
+
+
+## Appendix
+
+### Requirements and existing mechanisms analysis
+
+The op description should be as declarative as possible to allow a wide range
+of tools to work with them and query methods generated from them. In particular
+this means specifying traits, constraints and shape inference information in
+a way that is easily analyzable (e.g., avoid opaque calls to C++ functions where
+possible).
+
+We considered the approaches of several contemporary systems and focused on
+requirements that were desirable:
+
+* Ops registered using a registry separate from C++ code.
+  * Unknown ops are allowed in MLIR, so ops need not be registered. The
+    ability of the compiler to optimize those ops or graphs containing those
+    ops is constrained but correct. 
+ * The current proposal does not include a runtime op description, but it + does not preclude such description, it can be added later. + * The op registry is essential for generating C++ classes that make + manipulating ops, verifying correct construction etc. in C++ easier by + providing a typed representation and accessors. +* The op registry will be defined in + [TableGen](https://llvm.org/docs/TableGen/index.html) and be used to + generate C++ classes and utility functions + (builder/verifier/parser/printer). + * TableGen is a modelling specification language used by LLVM's backends + and fits in well with trait-based modelling. This is an implementation + decision and there are alternative ways of doing this. But the + specification language is good for the requirements of modelling the + traits (as seen from usage in LLVM processor backend modelling) and easy + to extend, so a practical choice. If another good option comes up, we + will consider it. +* MLIR allows both defined and undefined ops. + * Defined ops should have fixed semantics and could have a corresponding + reference implementation defined using, for example, EDSC. + * Dialects are under full control of the dialect owner and normally live + with the framework of the dialect. +* The op's traits (e.g., commutative) are modelled along with the op in the + registry. +* The op's operand/return type constraints are modelled along with the op in + the registry (see [Shape inference](#shape-inference) discussion below), + this allows (e.g.) optimized concise syntax in textual dumps. +* Behavior of the op is documented along with the op with a summary and a + description. The description is written in markdown and extracted for + inclusion in the generated LangRef section of the dialect. 
+* The generic assembly form of printing and parsing is available as normal, + but a custom parser and printer can either be specified or automatically + generated from an optional string representation showing the mapping of the + "assembly" string to operands/type. + * Parser-level remappings (e.g., `eq` to enum) will be supported as part + of the parser generation. +* Matching patterns are specified separately from the op description. + * Contrasted with LLVM there is no "base" set of ops that every backend + needs to be aware of. Instead there are many different dialects and the + transformations/legalizations between these dialects form a graph of + transformations. +* Reference implementation may be provided along with the op definition. + + * The reference implementation may be in terms of either standard ops or + other reference implementations. + + TODO: document expectation if the dependent op's definition changes. + +### A proposal for auto-generating printer and parser methods + +NOTE: Auto-generating printing/parsing (as explained in the below) has _not_ +been prototyped, and potentially just being able to specify custom printer/ +parser methods are sufficient. This should presumably be influenced by the +design of the assembler/disassembler logic that LLVM backends get for free +for machine instructions. + +The custom assembly form of the operation is specified using a string with +matching operation name, operands and attributes. With the ability +to express additional information that needs to be parsed to build the +operation: + +```tablegen +tfl.add $lhs, $rhs {fused_activation_function: $fused_activation_function}: ${type(self)} +``` + +1. The output is never shown in the "mnemonics" string as that is fixed form + and cannot be altered. +1. Custom parsing of ops may include some punctuation (e.g., parenthesis). +1. The operands/results are added to the created operation in the order that + they are shown in the input and output dags. +1. 
The `${type(self)}` operator is used to represent the type of the operator. + The type of operands can also be queried. +1. Attributes names are matched to the placeholders in the mnemonic strings. + E.g., attribute axis is matched with `$axis`. Custom parsing for attribute + type can be defined along with the attribute definition. +1. The information in the custom assembly form should be sufficient to invoke + the builder generated. That may require being able to propagate information + (e.g., the `$lhs` has the same type as the result). + +Printing is effectively the inverse of the parsing function generated with the +mnemonic string serving as a template. + +### Shape inference + +Type constraints are along (at least) three axis: 1) elemental type, 2) rank +(including static or dynamic), 3) dimensions. While some ops have no compile +time fixed shape (e.g., output shape is dictated by data) we could still have +some knowledge of constraints/bounds in the system for that op (e.g., the output +of a `tf.where` is at most the size of the input data). And so there are +additional valuable constraints that could be captured even without full +knowledge. + +Initially the shape inference will be declaratively specified using: + +* Constraint on the operands of an operation directly. For example + constraining the input type to be tensor/vector elements or that the + elemental type be of a specific type (e.g., output of sign is of elemental + type `i1`) or class (e.g., float like). +* Constraints across operands and results of an operation. For example, + enabling specifying equality constraints on type/constituents of a type + (shape and elemental type) between operands and results (e.g., the output + type of an add is the same as those of the input operands). + +In general there is an input/output transfer function which maps the inputs to +the outputs (e.g., given input X and Y [or slices thereof] with these sizes, the +output is Z [or this slice thereof]). 
Such a function could be used to determine +the output type (shape) for given input type (shape). + +But shape functions are determined by attributes and could be arbitrarily +complicated with a wide-range of specification possibilities. Equality +relationships are common (e.g., the elemental type of the output matches the +primitive type of the inputs, both inputs have exactly the same type [primitive +type and shape]) and so these should be easy to specify. Algebraic relationships +would also be common (e.g., a concat of `[n,m]` and `[n,m]` matrix along axis 0 +is `[n+n, m]` matrix), while some ops only have defined shapes under certain +cases (e.g., matrix multiplication of `[a,b]` and `[c,d]` is only defined if +`b == c`). As ops are also verified, the shape inference need only specify rules +for the allowed cases (e.g., shape inference for matmul can ignore the case +where `b != c`), which would simplify type constraint specification. + +Instead of specifying an additional mechanism to specify a shape transfer +function, the reference implementation of the operation will be used to derive +the shape function. The reference implementation is general and can support the +arbitrary computations needed to specify output shapes. 
+ +[TableGen]: https://llvm.org/docs/TableGen/index.html +[TableGenIntro]: https://llvm.org/docs/TableGen/LangIntro.html +[TableGenRef]: https://llvm.org/docs/TableGen/LangRef.html +[TableGenBackend]: https://llvm.org/docs/TableGen/BackEnds.html#introduction +[OpBase]: https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/OpBase.td +[OpDefinitionsGen]: https://github.com/tensorflow/mlir/blob/master/tools/mlir-tblgen/OpDefinitionsGen.cpp +[EnumsGen]: https://github.com/tensorflow/mlir/blob/master/tools/mlir-tblgen/EnumsGen.cpp +[StringAttr]: https://github.com/tensorflow/mlir/blob/master/g3doc/LangRef.md#string-attribute +[IntegerAttr]: https://github.com/tensorflow/mlir/blob/master/g3doc/LangRef.md#integer-attribute diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md new file mode 100644 index 0000000000000000000000000000000000000000..78ea257b57bc336d65cab8197898639f5ac20cd4 --- /dev/null +++ b/mlir/docs/Passes.md @@ -0,0 +1,298 @@ +# MLIR Passes + +This document describes the available MLIR passes and their contracts. + +[TOC] + +## Affine control lowering (`-lower-affine`) + +Convert operations related to affine control into a graph of blocks using +operations from the standard dialect. + +Loop statements are converted to a subgraph of blocks (initialization, condition +checking, subgraph of body blocks) with loop induction variable being passed as +the block argument of the condition checking block. Conditional statements are +converted to a subgraph of blocks (chain of condition checking with +short-circuit logic, subgraphs of 'then' and 'else' body blocks). `affine.apply` +operations are converted into sequences of primitive arithmetic operations that +have the same effect, using operands of the `index` type. Consequently, named +maps and sets may be removed from the module. 
+ +For example, `%r = affine.apply (d0, d1)[s0] -> (d0 + 2*d1 + s0)(%d0, %d1)[%s0]` +can be converted into: + +```mlir +%d0 = <...> +%d1 = <...> +%s0 = <...> +%0 = constant 2 : index +%1 = muli %0, %d1 +%2 = addi %d0, %1 +%r = addi %2, %s0 +``` + +### Input invariant + +- no `Tensor` types; + +These restrictions may be lifted in the future. + +### Output IR + +Functions with `affine.for` and `affine.if` operations eliminated. These +functions may contain operations from the Standard dialect in addition to those +already present before the pass. + +### Invariants + +- Functions without a body are not modified. +- The semantics of the other functions is preserved. +- Individual operations other than those mentioned above are not modified if + they do not depend on the loop iterator value or on the result of + `affine.apply`. + +## Conversion from Standard to LLVM IR dialect (`-convert-std-to-llvm`) + +Convert standard operations into the LLVM IR dialect operations. + +### Input invariant + +- operations including: arithmetic on integers and floats, constants, direct + calls, returns and branches; +- no `tensor` types; +- all `vector` are one-dimensional; +- all blocks are reachable by following the successors of the first basic + block; + +If other operations are present and their results are required by the LLVM IR +dialect operations, the pass will fail. Any LLVM IR operations or types already +present in the IR will be kept as is. + +### Output IR + +Functions converted to LLVM IR. Function arguments types are converted +one-to-one. Function results are converted one-to-one and, in case more than 1 +value is returned, packed into an LLVM IR struct type. Function calls and +returns are updated accordingly. Block argument types are updated to use LLVM IR +types. 
+ +## Data Copy DMA generation (`-affine-data-copy-generate`) + +Replaces all loads and stores on memref's living in 'slowMemorySpace' by +introducing DMA operations (strided DMA if necessary) to transfer data to/from +`fastMemorySpace` and rewriting the original load's/store's to instead +load/store from the allocated fast memory buffers. Additional options specify +the identifier corresponding to the fast memory space and the amount of fast +memory space available. The pass traverses through the nesting structure, +recursing to inner levels if necessary to determine at what depth DMA transfers +need to be placed so that the allocated buffers fit within the memory capacity +provided. If this is not possible (for example, when the elemental type itself +is of size larger than the DMA capacity), an error with location information is +emitted. The DMA transfers are also hoisted up past all loops with respect to +which the transfers are invariant. + +Input + +```mlir +func @loop_nest_tiled() -> memref<256x1024xf32> { + %0 = alloc() : memref<256x1024xf32> + affine.for %i0 = 0 to 256 step 32 { + affine.for %i1 = 0 to 1024 step 32 { + affine.for %i2 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 32)(%i0) { + affine.for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) { + %1 = affine.load %0[%i2, %i3] : memref<256x1024xf32> + } + } + } + } + return %0 : memref<256x1024xf32> +} +``` + +Output (with flags: -affine-data-copy-generate -affine-data-copy-generate-fast-mem-space=2) + +```mlir +module { + func @loop_nest_tiled() -> memref<256x1024xf32> { + %c262144 = constant 262144 : index + %c0 = constant 0 : index + %0 = alloc() : memref<256x1024xf32> + %1 = alloc() : memref<256x1024xf32, 2> + %2 = alloc() : memref<1xi32> + affine.dma_start %0[%c0, %c0], %1[%c0, %c0], %2[%c0], %c262144 : memref<256x1024xf32>, memref<256x1024xf32, 2>, memref<1xi32> + affine.dma_wait %2[%c0], %c262144 : memref<1xi32> + affine.for %arg0 = 0 to 256 step 32 { + affine.for %arg1 = 0 to 1024 step 32 { + 
affine.for %arg2 = #map1(%arg0) to #map2(%arg0) { + affine.for %arg3 = #map1(%arg1) to #map2(%arg1) { + %3 = affine.load %1[%arg2, %arg3] : memref<256x1024xf32, 2> + } + } + } + } + dealloc %2 : memref<1xi32> + dealloc %1 : memref<256x1024xf32, 2> + return %0 : memref<256x1024xf32> + } +} +``` + +## Loop tiling (`-affine-loop-tile`) + +Performs tiling or blocking of loop nests. It currently works on perfect loop +nests. + +## Loop unroll (`-affine-loop-unroll`) + +This pass implements loop unrolling. It is able to unroll loops with arbitrary +bounds, and generate a cleanup loop when necessary. + +## Loop unroll and jam (`-affine-loop-unroll-jam`) + +This pass implements unroll and jam for loops. It works on both perfect or +imperfect loop nests. + +## Loop fusion (`-affine-loop-fusion`) + +Performs fusion of loop nests using a slicing-based approach. The fused loop +nests, when possible, are rewritten to access significantly smaller local +buffers instead of the original memref's, and the latter are often +either completely optimized away or contracted. This transformation leads to +enhanced locality and lower memory footprint through the elimination or +contraction of temporaries / intermediate memref's. These benefits are sometimes +achieved at the expense of redundant computation through a cost model that +evaluates available choices such as the depth at which a source slice should be +materialized in the designation slice. + +## Memref bound checking (`-memref-bound-check`) + +Checks all load's and store's on memref's for out of bound accesses, and reports +any out of bound accesses (both overrun and underrun) with location information. 
+ +```mlir +test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of upper bound access along dimension #2 + %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> + ^ +test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of lower bound access along dimension #2 + %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> + ^ +``` + +## Memref dataflow optimization (`-memref-dataflow-opt`) + +This pass performs store to load forwarding for memref's to eliminate memory +accesses and potentially the entire memref if all its accesses are forwarded. + +Input + +```mlir +func @store_load_affine_apply() -> memref<10x10xf32> { + %cf7 = constant 7.0 : f32 + %m = alloc() : memref<10x10xf32> + affine.for %i0 = 0 to 10 { + affine.for %i1 = 0 to 10 { + affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32> + %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32> + %v1 = addf %v0, %v0 : f32 + } + } + return %m : memref<10x10xf32> +} +``` + +Output + +```mlir +module { + func @store_load_affine_apply() -> memref<10x10xf32> { + %cst = constant 7.000000e+00 : f32 + %0 = alloc() : memref<10x10xf32> + affine.for %arg0 = 0 to 10 { + affine.for %arg1 = 0 to 10 { + affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32> + %1 = addf %cst, %cst : f32 + } + } + return %0 : memref<10x10xf32> + } +} + +``` + +## Memref dependence analysis (`-memref-dependence-check`) + +This pass performs dependence analysis to determine dependences between pairs of +memory operations (load's and store's) on memref's. Dependence analysis exploits +polyhedral information available (affine maps, expressions, and affine.apply +operations) to precisely represent dependences using affine constraints, while +also computing dependence vectors from them, where each component of the +dependence vector provides a lower and an upper bound on the dependence distance +along the corresponding dimension. 
+ +```mlir +test/Transforms/memref-dataflow-opt.mlir:232:7: note: dependence from 2 to 1 at depth 1 = ([1, 1], [-inf, +inf]) + store %cf9, %m[%idx] : memref<10xf32> +``` + +## Pipeline data transfer (`-affine-pipeline-data-transfer`) + +This pass performs a transformation to overlap non-blocking DMA operations in a +loop with computations through double buffering. This is achieved by advancing +dma_start operations with respect to other operations. + +Input + +```mlir +func @pipelinedatatransfer() { + %0 = alloc() : memref<256xf32> + %1 = alloc() : memref<32xf32, 1> + %2 = alloc() : memref<1xf32> + %c0 = constant 0 : index + %c128 = constant 128 : index + affine.for %i0 = 0 to 8 { + affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32> + affine.dma_wait %2[%c0], %c128 : memref<1xf32> + %3 = affine.load %1[%i0] : memref<32xf32, 1> + %4 = "compute"(%3) : (f32) -> f32 + affine.store %4, %1[%i0] : memref<32xf32, 1> + } + return +} +``` + +Output + +```mlir +module { + func @pipelinedatatransfer() { + %c8 = constant 8 : index + %c0 = constant 0 : index + %0 = alloc() : memref<256xf32> + %c0_0 = constant 0 : index + %c128 = constant 128 : index + %1 = alloc() : memref<2x32xf32, 1> + %2 = alloc() : memref<2x1xf32> + affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> + affine.for %arg0 = 1 to 8 { + affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> + %8 = affine.apply #map3(%arg0) + %9 = affine.apply #map4(%8) + %10 = affine.apply #map4(%8) + affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> + %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1> + %12 = "compute"(%11) : (f32) -> f32 + affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1> + } + %3 = affine.apply #map3(%c8) + %4 = affine.apply #map4(%3) + %5 = affine.apply 
#map4(%3) + affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> + %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1> + %7 = "compute"(%6) : (f32) -> f32 + affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1> + dealloc %2 : memref<2x1xf32> + dealloc %1 : memref<2x32xf32, 1> + return + } +} +``` diff --git a/mlir/docs/Quantization.md b/mlir/docs/Quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..99e450ca84dacde19b2691bab228c30e3ade7fd6 --- /dev/null +++ b/mlir/docs/Quantization.md @@ -0,0 +1,359 @@ +# MLIR Quantization + +This document outlines the design of the MLIR quantization system. While the +term "quantization" is highly overloaded, in this case, it refers to a fairly +narrow scope of techniques in use to enable conversion of floating-point +computations to corresponding and plausible variants expressed in integer math +for inference, as has historically been supported by low-bit depth inference +engines such as TFLite, various accelerator hardware, and many DSPs. + +Much of this is inspired by the approach taken +[in this paper](https://arxiv.org/abs/1712.05877) with many extensions and +adaptations folded in. It specifically documents the positions that MLIR has +taken on the topic, and is not a general reference. + +[TOC] + +## Uniform quantization + +The primary quantization mechanism supported by MLIR is a scheme which can +express fixed point and affine transformations via uniformly spaced point on the +Real number line. + +Further, the scheme can be applied: + +* *per-layer* : Applying to every value within the target type. +* *per-axis* (also called *per-channel*) : Applying individually to each index + along a specific axis of a tensor type. + +### Fixed point values + +[Fixed point](https://en.wikipedia.org/wiki/Fixed-point_arithmetic) values are a +[Real](https://en.wikipedia.org/wiki/Real_number) number divided by a *scale*. +We will call the result of the divided Real the *scaled value*. 
+ +$$ real\_value = scaled\_value * scale $$ + +The scale can be interpreted as the distance, in Real units, between neighboring +scaled values. For example, if the scale is $$ \pi $$, then fixed point values +with this scale can only represent multiples of $$ \pi $$, and nothing in +between. The maximum rounding error to convert an arbitrary Real to a fixed +point value with a given $$ scale $$ is $$ \frac{scale}{2} $$. Continuing the +previous example, when $$ scale = \pi $$, the maximum rounding error will be $$ +\frac{\pi}{2} $$. + +Multiplication can be performed on scaled values with different scales, using +the same algorithm as multiplication of Real values (note that product scaled +value has $$ scale_{product} = scale_{left \mbox{ } operand} * scale_{right +\mbox{ } operand} $$). Addition can be performed on scaled values, as long as +they have the same scale, using the same algorithm as addition of Real values. +This makes it convenient to represent scaled values on a computer as signed +integers, and perform arithmetic on those signed integers, because the results +will be correct scaled values. + +### Affine values + +Mathematically speaking, affine values are the result of +[adding a Real-valued *zero point*, to a scaled value](https://en.wikipedia.org/wiki/Affine_transformation#Representation). +Or equivalently, subtracting a zero point from an affine value results in a +scaled value: + +$$ real\_value = scaled\_value * scale = (affine\_value - zero\_point) * scale $$ + +Essentially, affine values are a shifting of the scaled values by some constant +amount. Arithmetic (i.e., addition, subtraction, multiplication, division) +cannot, in general, be directly performed on affine values; you must first +[convert](#affine-to-fixed-point) them to the equivalent scaled values. + +As alluded to above, the motivation for using affine values is to more +efficiently represent the Real values that will actually be encountered during +computation. 
Frequently, the Real values that will be encountered are not +symmetric around the Real zero. We also make the assumption that the Real zero +is encountered during computation, and should thus be represented. + +In this case, it's inefficient to store scaled values represented by signed +integers, as some of the signed integers will never be used. The bit patterns +corresponding to those signed integers are going to waste. + +In order to exactly represent the Real zero with an integral-valued affine +value, the zero point must be an integer between the minimum and maximum affine +value (inclusive). For example, given an affine value represented by an 8 bit +unsigned integer, we have: $$ 0 \leq zero\_point \leq 255$$. This is important, +because in deep neural networks' convolution-like operations, we frequently +need to zero-pad inputs and outputs, so zero must be exactly representable, or +the result will be biased. + +### Relation + +Real values, fixed point values, and affine values relate through the following +equation, which demonstrates how to convert one type of number to another: + +$$ real\_value = scaled\_value * scale = (affine\_value - zero\_point) * scale $$ + +Note that computers generally store mathematical values using a finite number of +bits. Thus, while the above conversions are exact, to store the result in a +finite number of bits, we must, in general, round the result of the conversion +(this applies to both cases: storing using floating point and storing using +fixed point). Note that a full discussion of rounding behavior is outside the +scope of this document, and it is safe to assume unless otherwise stated that +rounding should be according to the IEEE754 default of RNE (where hardware +permits). + +### Converting between Real and fixed point or affine + +To convert a Real value to a fixed point value, you must know the scale. To +convert a Real value to an affine value, you must know the scale and zero point. 
+ +#### Real to affine + +To convert an input tensor of Real-valued elements (usually represented by a +floating point format, frequently +[Single precision](https://en.wikipedia.org/wiki/Single-precision_floating-point_format)) +to a tensor of affine elements represented by an integral type (e.g. 8-bit +unsigned integer), the following conversion can be performed (note that it is +not required that all representable values of the integral type are used): + +$$ +\begin{align*} +af&fine\_value_{uint8 \, or \, uint16} \\ + &= clampToTargetSize(roundToNearestInteger( \frac{real\_value_{Single}}{scale_{Single}})_{sint32} + zero\_point_{uint8 \, or \, uint16}) +\end{align*} +$$ + +In the above, we assume that $$real\_value$$ is a Single, $$scale$$ is a Single, +$$roundToNearestInteger$$ returns a signed 32 bit integer, and $$zero\_point$$ +is an unsigned 8 or 16 bit integer. Note that bit depth and number of fixed +point values are indicative of common types on typical hardware but is not +constrained to particular bit depths or a requirement that the entire range of +an N-bit integer is used. + +#### Affine to Real + +To convert an output tensor of affine elements represented by uint8 +or uint16 to a tensor of Real-valued elements (usually represented with a +floating point format, frequently Single precision), the following conversion +can be performed: + +$$ +\begin{align*} +re&al\_value_{Single} \\ + &= roundToNearestFloat((affine\_value_{uint8 \, or \, uint16} - zero\_point_{uint8 \, or \, uint16})_{sint32})_{Single} * scale_{Single} +\end{align*} +$$ + +In the above, we assume that the result of subtraction is in 32-bit signed +integer format, and that $$roundToNearestFloat$$ returns a Single. + +#### Affine to fixed point + +When the affine and fixed point scales are the same, subtract the zero point +from the affine value to get the equivalent fixed point value. 
+ +$$ +scaled\_value = affine\_value_{non\mbox{-}negative} - zero\_point_{non\mbox{-}negative} +$$ + +#### Fixed point to affine + +When the affine and fixed point scales are the same, add the zero point to the +fixed point value to get the equivalent affine value. + +$$ +affine\_value_{non\mbox{-}negative} = scaled\_value + zero\_point_{non\mbox{-}negative} +$$ + +## Usage within MLIR + +There are several components to the quantization system being developed within +MLIR: + +* *Quantization* dialect containing: + + * A family of [QuantizedTypes](#quantized-type) which represent the + mapping between *expressed* values (typically of a floating point + computer type) and *storage* values (typically of an integral computer + type). + * [Type conversion ops](#quantized-type-conversion-ops) for converting + between types based on a QuantizedType and its *expressed* and *storage* + sub-types. + * [Instrumentation ops](#instrumentation-and-constraint-ops) for assigning + instrumentation points within the computation where runtime statistics + may help guide the quantization process. + +* [Integration with simulated quantization at training time](#integration-with-simulated-quantization-at-training-time) + +* [TFLite native quantization](#tflite-native-quantization) + + * The TFLite op-set natively supports uniform-quantized variants. + * Passes and tools exist to convert directly from the *TensorFlow* dialect + to the TFLite quantized op-set. + +* [*FxpMath* dialect](#fxpmath-dialect) containing (experimental) generalized + representations of fixed-point math ops and conversions: + + * [Real math ops](#real-math-ops) representing common combinations of + arithmetic operations that closely match corresponding fixed-point math + concepts (as opposed to being spread across multiple ops as is typical + in source dialects). 
+ * [Fixed-point math ops](#fixed-point-math-ops) for carrying out + computations on integers, as are typically needed by uniform + quantization schemes. + * Passes to lower from real math ops to fixed-point math ops. + +* [Solver tools](#solver-tools) which can (experimentally and generically) + operate on computations expressed in the *FxpMath* dialect in order to + convert from floating point types to appropriate *QuantizedTypes*, allowing + the computation to be further lowered to integral math ops. + +Not every application of quantization will use all facilities. Specifically, the +TensorFlow to TensorFlow Lite conversion uses the QuantizedTypes but has its own +ops for type conversion and expression of the backing math. + +## Quantization Dialect + +### Quantized type + +TODO : Flesh this section out. + +* QuantizedType base class +* UniformQuantizedType + +### Quantized type conversion ops + +* qcast : Convert from an expressed type to QuantizedType +* dcast : Convert from a QuantizedType to its expressed type +* scast : Convert between a QuantizedType and its storage type + +### Instrumentation and constraint ops + +* const_fake_quant : Emulates the logic of the historic TensorFlow + fake_quant_with_min_max_args op. +* stats_ref : Declares that statistics should be gathered at this point with a + unique key and made available to future passes of the solver. +* stats : Declares inline statistics (per layer and per axis) for the point in + the computation. stats_ref ops are generally converted to stats ops once + trial runs have been performed. +* coupled_ref : Declares points in the computation to be coupled from a type + inference perspective based on a unique key. 
+ +## Integration with simulated quantization at training time + +TensorFlow has historically used the +[tf.quantization.fake_quant_\*](https://www.tensorflow.org/api_docs/python/tf/quantization/fake_quant_with_min_max_args) +family of operations to simulate the effect of quantization at training time. + +As originally implemented, TensorFlow Lite was the primary user of such +operations at inference time. When quantized inference was enabled, if every +eligible tensor passed through an appropriate fake_quant node (the rules of +which tensors can have fake_quant applied are somewhat involved), then +TensorFlow Lite would use the attributes of the fake_quant ops to make a +judgment about how to convert to use kernels from its quantized ops subset. + +In MLIR-based quantization, fake_quant_\* ops are handled by converting them to +a sequence of *qcast* (quantize) followed by *dcast* (dequantize) with an +appropriate *UniformQuantizedType* as the target of the qcast operation. + +This allows subsequent compiler passes to preserve the knowledge that +quantization was simulated in a certain way while giving the compiler +flexibility to move the casts as it simplifies the computation and converts it +to a form based on integral arithmetic. + +This scheme also naturally allows computations that are *partially quantized* +where the parts which could not be reduced to integral ops are still carried out +in floating point with appropriate conversions at the boundaries. + +## TFLite Native Quantization + +TODO : Flesh this out + +### General algorithm + +1. Take input min/max information and set the ArrayInfo (which really is + InputOrOutputArrayInfo). +1. In LegalizeTF, convert ArrayInfo min/max to tf.Quantize and tf.Dequantize + nodes. (or tf.FakeQuant) Convert all constant FakeQuants to (tf.FQ -> tfl.Q + -> tfl.DQ). +1. Hardcode logic/propagation needs to happen here. +1. Run TF constant folding. +1. In PrepareTFL, convert all tf.FQ to (tfl.Q -> tfl.DQ). +1. 
Run quantization pass that takes (tfl.DQ (for both input and weights) -> op + -> tfl.Q) and replaces it with (op). Also replace (constant_float -> tfl.Q) + with (constant_quant). + +## FxpMath Dialect + +### Real math ops + +Note that these all support explicit clamps, which allows for simple fusions and +representation of some common sequences of quantization-compatible math. In +addition, some support explicit biases, which are often represented as separate +adds in source dialects. + +TODO: This op set is still evolving and needs to be completed. + +* RealBinaryOp + * RealAddEwOp + * RealSubEwOp + * RealMulEwOp + * RealDivEwOp +* RealUnaryOp + * IDENTITY + * TANH + * SIGMOID + * EXP + * LOG + * NEG + * RSQRT + * SIN + * SQUARE + * SQRT + * CMPZ + * CMPNZ + * CMPLZ + * CMPGZ + +### Fixed-point math ops + +TODO: This op set only has enough ops to lower a simple power-of-two +RealAddEwOp. + +* RoundingDivideByPotFxpOp +* SaturatingAddFxpOp + +## Solver tools + +Solver tools exist to analyze an MLIR-computation, expressed in either a +supported source dialect or in the *real math ops* set and solve for appropriate +QuantizedTypes that allow the computation to be lowered to integral math. + +These tools are an active area of work and may be expanded in the future to +adjacent areas such as solving for transformations to other kinds of lower +precision types (i.e. bfloat16 or fp16). + +Solver tools are expected to operate in several modes, depending on the +computation and the manner in which it was trained: + +* *Transform* : With all available information in the MLIR computation, infer + boundaries where the computation can be carried out with integral math and + change types accordingly to appropriate QuantizedTypes: + + * For passthrough ops which do not perform active math, change them to + operate directly on the storage type, converting in and out at the edges + via scast ops. + * For ops that have the *Quantizable* trait, the type can be set directly. 
+ This includes ops from the [real math ops set](#real-math-ops). + * For others, encase them in appropriate dcast/qcast ops, presuming that + some follow-on pass will know what to do with them. + +* *Instrument* : Most of the time, there are not sufficient implied + constraints within a computation to perform many transformations. For this + reason, the solver can insert instrumentation ops at points where additional + runtime statistics may yield solutions. It is expected that such + computations will be lowered as-is for execution, run over an appropriate + eval set, and statistics at each instrumentation point made available for a + future invocation of the solver. + +* *Simplify* : A variety of passes and simplifications are applied once + QuantizedTypes are added in order to arrive at a computation that is + expressed in as much integral math, with the fewest number of casts as + possible. diff --git a/mlir/docs/QuickstartRewrites.md b/mlir/docs/QuickstartRewrites.md new file mode 100644 index 0000000000000000000000000000000000000000..6a4a7cca8b88d9d0f282de9c82c78518f3f555c9 --- /dev/null +++ b/mlir/docs/QuickstartRewrites.md @@ -0,0 +1,255 @@ +# Quickstart tutorial to adding MLIR graph rewrite + +This document will present a quickstart to adding graph rewrites. We shall start +by defining an operation, showing multiple ways to define the rewrite using +patterns, as well as defining the rewrite using a graph walker (note: using +patterns and the rewrite engine is preferred, showing the walker is for +demonstration purposes). + +See [MLIR specification](LangRef.md) for more information about MLIR, the +structure of the IR, operations, etc. See +[Table-driven Operation Definition](OpDefinitions.md) and +[Declarative Rewrite Rule](DeclarativeRewrites.md) for the detailed explanation +of all available mechanisms for defining operations and rewrites in a +table-driven manner. 
+ +## Adding operation + +An operation in MLIR is specified using a definition in a +[TableGen](https://llvm.org/docs/TableGen/LangIntro.html) file. TableGen is a +modeling tool used to specify the ops, from which the C++ code to interact with +these operations is generated. To define an operation one needs to specify: + +* The operation name. This name is a unique identifier of the operation within + MLIR. Most operations are within a dialect, so for example one could have + `tfl.add` to represent the add operation in the TensorFlow Lite dialect. + Instead of repeating the dialect in the op definition, a base class for the + op dialect is commonly created that prepends the dialect namespace given an + op name. +* The traits of the operation. These allow you to specify traits of the + operation, such as whether it has side effects or whether it should be + verified that the operands and result types are the same. These are backed + by C++ traits that perform the verification. +* The arguments of the operation. These are the input operands (values at + runtime produced by other ops) and attributes (compile time known constant + values that affect the behavior of the op) that are the inputs of/define the + behavior of the operation. The input operands may be named, the attributes + must be named. +* The result(s) of the operation. These may again be named or not. +* Documentation of the operation. This includes a one-line summary as well as + a longer human-readable description of the operation. +* Dialect specific information. Additional information could be added to the + operation definition that is only used by dialect specific drivers. These + are ignored by the main op and doc generators, but could be used in, say, + the translation from a dialect to another representation. + +```tablegen +def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameValueType]>, + Results<(outs Tensor)> { + let arguments = (ins + F32Tensor:$x, + // Slope of the activation function at x < 0. 
+ F32Attr:$alpha + ); + + let summary = "Leaky ReLU operator"; + let description = [{ + Element-wise Leaky ReLU operator + x -> x >= 0 ? x : (alpha * x) + }]; + + // TFLite specific attribute that is used when generating the output + // flatbuffer. + let hasOptions = 1; +} +``` + +Note in the above the result types and inputs are specified in different ways, +one by way of trait and the other by way of let. It is possible to specify both +in either way. + + + +Operations can also have custom parser, printer, builder, verifier, constant +folder, or canonicalizer. These require specifying additional C++ methods to +invoke for additional functionality. For example, if an operation is marked to +have a folder, the constant folder also needs to be added, e.g.,: + +```c++ +OpFoldResult SpecificOp::fold(ArrayRef<Attribute> constOperands) { + if (unable_to_fold) + return {}; + .... + return val; +} +``` + +## Adding patterns + +There are multiple forms of graph rewrite that can be performed in MLIR. One of +the most common is DAG tile to DAG tile rewrite. Patterns provide a concise way +to express this transformation as a pair of source pattern to match and +resultant pattern. There are both the C++ classes to represent this +transformation, as well as the patterns in TableGen from which these can be +generated. + +### TableGen patterns + +Let us continue with LeakyRelu. To map from TensorFlow's `LeakyRelu` to +TensorFlow Lite's `LeakyRelu`: + +```tablegen +def : Pat<(TF_LeakyReluOp $arg, F32Attr:$a), (TFL_LeakyReluOp $arg, $a)> +``` + +The pattern is specified by instantiating a `Pat` with a source and result DAG. +The arguments in the source pattern are captured and can be used in the result +pattern. This is a simple pattern as we have a 1:1 mapping and the attribute +does not need to be transformed (e.g., both have a floating point attribute for +alpha). 
The names of the attributes specified in the pattern are for +matching/referencing and need not match the original attribute name in the op +definition but the order of arguments of the dags do need to match. + +To specify a pattern, both the source and resultant ops need to be defined using +TableGen. + +If this were a more advanced pattern that the current framework could not express +as a destination then one could use a general native code fallback method. This +consists of defining a pattern as well as adding a C++ function to perform the +replacement: + +```tablegen +def createTFLLeakyRelu : NativeCodeCall< + "createTFLLeakyRelu($_builder, $0->getDefiningOp(), $1, $2)">; + +def : Pat<(TF_LeakyReluOp:$old_value, $arg, F32Attr:$a), + (createTFLLeakyRelu $old_value, $arg, $a)>; +``` + +```c++ +static Value createTFLLeakyRelu(PatternRewriter &rewriter, Operation *op, + Value operand, Attribute attr) { + return rewriter.create<TFL::LeakyReluOp>( + op->getLoc(), operand->getType(), /*arg=*/operand, + /*alpha=*/attr.cast<FloatAttr>()); +} +``` + +This allows for arbitrarily complex builders. On the input pattern side one can +express multi-op patterns with constraints on input operands and attributes. But +input patterns cannot yet express constraints across multiple operands/attributes. + +### Register the pattern + +The file containing the patterns needs to be processed using `mlir-tblgen` +`-gen-rewriters` during compilation time. It can be invoked with the following +configuration in CMake: + +```cmake +set(LLVM_TARGET_DEFINITIONS <name-of-the-td-file>) +mlir_tablegen(<name-of-the-generated-inc-file> -gen-rewriters) +add_public_tablegen_target(<name-of-the-cmake-target>) +``` + +Then you can `#include` the generated file in any C++ implementation file you +like. (You will also need to make sure the library depends on the CMake target +defined in the above.) 
The generated file will have a `populateWithGenerated( +MLIRContext *context, OwningRewritePatternList *patterns)` function that you can +use to collect all the generated patterns inside `patterns` and then use +`patterns` in any pass you would like. + +### C++ rewrite specification + +In case patterns are not sufficient there is also the fully C++ way of +expressing a rewrite: + +```c++ +/// Multi-step rewrite using "match" and "rewrite". This allows for separating +/// the concerns of matching and rewriting. +struct ConvertTFLeakyRelu : public RewritePattern { + ConvertTFLeakyRelu(MLIRContext *context) + : RewritePattern("tf.LeakyRelu", 1, context) {} + + PatternMatchResult match(Operation *op) const override { + return matchSuccess(); + } + + void rewrite(Operation *op, PatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, op->getResult(0)->getType(), op->getOperand(0), + /*alpha=*/op->getAttrOfType("alpha")); + } +}; + +/// Single-step rewrite with "matchAndRewrite". This allows for performing the +/// rewrite immediately upon a successful match. +struct ConvertTFLeakyRelu : public RewritePattern { + ConvertTFLeakyRelu(MLIRContext *context) + : RewritePattern("tf.LeakyRelu", 1, context) {} + + PatternMatchResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, op->getResult(0)->getType(), op->getOperand(0), + /*alpha=*/op->getAttrOfType("alpha")); + return matchSuccess(); + } +}; +``` + +In the C++ rewrite the static benefit of the rewrite pattern is specified at +construction. While in the pattern generator a simple heuristic is currently +employed based around the number of ops matched and replaced. + +The above rule did not capture the matching operands/attributes, but in general +the `match` function in a multi-step rewrite may populate and return a +`PatternState` (or class derived from one) to pass information extracted during +matching to the rewrite. 
A single-step rewrite with the `matchAndRewrite` +function has the benefit of being able to directly use any values created when +matching; removing the need for `PatternState`. + +## Testing + +MLIR uses the [lit](https://llvm.org/docs/CommandGuide/lit.html) (LLVM Integrated +Testing) tool for performing testing. Testing is performed by way of creating +the input IR file, running a transformation and then verifying the output IR. +C++ unit tests are the exception, with the IR transformation serving as the core +testing mechanism. This results in fewer binaries that need to be built (and +linked) and forces a focus on the representation as an important piece. + +For the legalization transform above we would have a test (probably as part of +the legalization pass test in TensorFlow Lite) such as: + +```mlir +// RUN: mlir-opt -tfl-legalize-tf %s | FileCheck %s + +func @LeakyRelu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %2 = "tf.LeakyRelu"(%arg0) {alpha: 0.1} : (tensor<1xf32>) -> tensor<1xf32> + return %2: tensor<1xf32> + +// CHECK-LABEL: LeakyRelu +// CHECK: %0 = "tfl.leaky_relu"(%arg0) {alpha: 1.000000e-01} : (tensor<1xf32>) -> tensor<1xf32> +} +``` + +The RUN command at the top results in running the `mlir-opt` binary (which is a +compiler writer tool to exercise different registered passes) to invoke the +optimization pass this transform was added as part of on the current file and to +verify its output using `FileCheck`. `FileCheck` is a textual output verifier. In +particular it uses the CHECK expressions to verify the given output is produced. + +There can be multiple RUN commands with different corresponding CHECK prefixes. +And in addition multiple independent tests separated by `// -----` and +`mlir-opt` invoked with the `-split-input-file` flag. This is especially useful +for error testing. + +This results in very simple, directed testing without the need to work around +constant propagation or other, unrelated, optimization passes. 
+ +## Adding optimization pass + +Optimization passes that do not fit/are difficult to specify in the above +structure can be specified as general iterations across modules/functions. See +[Writing a Pass](WritingAPass.md) for a general overview and introduction to +optimization passes in MLIR. diff --git a/mlir/docs/Rationale.md b/mlir/docs/Rationale.md new file mode 100644 index 0000000000000000000000000000000000000000..763442dce0638342e314a5b31d4c32f0ab3d173c --- /dev/null +++ b/mlir/docs/Rationale.md @@ -0,0 +1,1121 @@ +# MLIR Rationale + +This document is intended to capture some of the alternatives considered and +open debates in the design of MLIR, along with the rationale for certain +decisions we made. This is not intended to be a "finely groomed" document - we +prefer the ability to dump in interesting tidbits without worrying too much +about their consistency or readability. + +[TOC] + +## Abstract + +MLIR is a compiler intermediate representation with similarities to traditional +three-address SSA representations (like +[LLVM IR](http://llvm.org/docs/LangRef.html) or +[SIL](https://github.com/apple/swift/blob/master/docs/SIL.rst)), but which +introduces notions from the polyhedral loop optimization works as first class +concepts. This hybrid design is optimized to represent, analyze, and transform +high level dataflow graphs as well as target-specific code generated for high +performance data parallel systems. Beyond its representational capabilities, its +single continuous design provides a framework to lower from dataflow graphs to +high performance target specific code. + +MLIR stands for one of "Multi-Level IR" or "Multi-dimensional Loop IR" or +"Machine Learning IR" or "Mid Level IR", we prefer the first. This document only +provides the rationale behind MLIR -- its actual +[specification document](LangRef.md) and other content is hosted elsewhere. 
+ +## Introduction and Motivation + +The Multi-Level Intermediate Representation (MLIR) is intended for easy +expression and optimization of computations involving deep loop nests and dense +matrices of high dimensionality. It is thus well-suited to deep learning +computations in particular. Yet it is general enough to also represent arbitrary +sequential computation. The representation allows high-level optimization and +parallelization for a wide range of parallel architectures including those with +deep memory hierarchies --- general-purpose multicores, GPUs, and specialized +neural network accelerators. + +MLIR uses ideas drawn from IRs of LLVM and Swift for lower level constructs +while combining them with ideas from the polyhedral abstraction to represent +loop nests, multidimensional data (tensors), and transformations on these +entities as first class concepts in the IR. + +MLIR is a multi-level IR, i.e., it represents code at a domain-specific +representation such as HLO or TensorFlow graphs, all the way down to the machine +level. MLIR is able to represent arbitrary control flow and arbitrary data +accesses, and is general enough to represent nearly all sequential computation. +This is a key distinction from existing polyhedral representation +implementations (such as LLVM [Polly](https://polly.llvm.org/)) that are able to +use the polyhedral abstraction in a way isolated from the LLVM IR and only for +affine loop nests, i.e., portions of the code where array accesses, loop bounds, +and conditionals are regular (involve linear functions of loop iterators and +constant symbols). The presence of statically unpredictable data accesses or +control flow does not preclude representation in MLIR, but only limits to a +certain extent the ability to reason about and apply transformations using the +polyhedral abstraction. 
+ +Maps, sets, and relations with affine constraints are the core structures +underlying a polyhedral representation of high-dimensional loop nests and +multidimensional arrays. These structures are represented as textual +expressions in a form close to their mathematical form. These structures are +used to capture loop nests, tensor data structures, and how they are reordered +and mapped for a target architecture. All structured or "conforming" loops are +captured as part of the polyhedral information, and so are tensor variables, +their layouts, and subscripted accesses to these tensors in memory. + +The information captured in the IR allows a compact expression of all loop +transformations, data remappings, explicit copying necessary for explicitly +addressed memory in accelerators, mapping to pre-tuned expert written +primitives, and mapping to specialized vector instructions. Loop transformations +that can be easily implemented include the body of affine transformations: these +subsume all traditional loop transformations (unimodular and non-unimodular) +such as loop tiling, interchange, permutation, skewing, scaling, relative +shifting, reversal, fusion, and distribution/fission. Transformations on data +layout such as padding and transforming to blocked layouts are also represented +well via affine layout maps. + +MLIR's design allows a progressive lowering to target-specific forms. Besides +high-level transformations for loop nests and data layouts that a typical +mid-level optimizer is expected to deal with, MLIR is also designed to perform +certain low-level scheduling and mapping decisions that a typical backend IR is +entrusted with: these include mapping to specialized vector instructions, +auto-vectorization, and software pipelining. 
The need to support these +transformations stems from the fact that neural network accelerators have +specialized units that deal with large chunks of data whose computation maps +back to chunks of more than one loop of the loop nests as viewed by a program at +a level closer to the original specification. Such specialized units or +instructions operate on multidimensional data chunks from a programmer's +viewpoint. It thus makes it hard or infeasible for a backend operating on a very +low-level IR close to assembly to lift and reconstruct loops and perform such a +mapping. This is in contrast to classic instruction selection and scheduling in +today's compilers that primarily only deals with the body of the innermost loop. +MLIR also facilitates automatic mapping to expert pre-tuned primitives or vendor +libraries operating on data at higher levels (or at the highest level) of the +memory hierarchy. + +In summary, MLIR is convenient for and closed under the kind of transformations +needed to lower to general-purpose as well as specialized accelerators. It also +allows one to build modular and reusable target independent and target dependent +passes. + +## Design Decisions + +This section sheds light on some of the design decisions -- some of these are +indirectly implied by the specification document. + +### Loads and stores + +The 'load' and 'store' instructions are specifically crafted to fully resolve to +an element of a memref. These instructions take as arguments n+1 indices for an +n-ranked tensor. This disallows the equivalent of pointer arithmetic or the +ability to index into the same memref in other ways (something which C arrays +allow for example). Furthermore, for the affine constructs, the compiler can +follow use-def chains (e.g. 
through +[affine.apply operations](Dialects/Affine.md#affineapply-operation)) or through +the map attributes of [affine operations](Dialects/Affine.md#Operations)) to +precisely analyze references at compile-time using polyhedral techniques. This +is possible because of the [restrictions on dimensions and symbols](Dialects/Affine.md#restrictions-on-dimensions-and-symbols). + +A scalar of element-type (a primitive type or a vector type) that is stored in +memory is modeled as a 0-d memref. This is also necessary for scalars that are +live out of for loops and if conditionals in a function, for which we don't yet +have an SSA representation -- +[an extension](#mlfunction-extensions-for-"escaping-scalars") to allow that is +described later in this doc. + +### Symbols and types + +The current MLIR disallows use of symbols in types. For example, when a tensor +or memref dimension is statically unknown, it is denoted in the type as '?'. An +SSA symbol is then bound to it when a memref is created. The actual value of the +unknown dimension can be queried using the "dim" builtin as shown below. + +Example: + +```mlir +func foo(...) { + %A = alloc <8x?xf32, #lmap> (%N) + ... + call bar(%A) : (memref<8x?xf32, #lmap>) +} + +func bar(%A : memref<8x?xf32, #lmap>) { + // Type of %A indicates that %A has dynamic shape with 8 rows + // and unknown number of columns. The number of columns is queried + // dynamically using dim instruction. + %N = dim %A, 1 : memref<8x?xf32, #lmap> + + affine.for %i = 0 to 8 { + affine.for %j = 0 to %N { + // A[i,j] += 1 + %s1 = affine.load %A[%i, %j] : memref<8x?xf32, #lmap> + %s2 = add %s1, 1 + affine.store %s2, %A[%i, %j] : memref<8x?xf32, #lmap> + } + } + return +} + +``` + +An alternative design is to embed the reference to symbols directly in the +type - memref<8x%Nxf32>. We went for the current approach in MLIR because it +simplifies the design --- types remain immutable when the values of symbols +change. 
+ +### Block Arguments vs PHI nodes + +MLIR Regions represent SSA using "[block arguments](LangRef.md#blocks)" rather +than [PHI instructions](http://llvm.org/docs/LangRef.html#i-phi) used in LLVM. +This choice is representationally identical (the same constructs can be +represented in either form) but block arguments have several advantages: + +1. LLVM PHI nodes always have to be kept at the top of a block, and + transformations frequently have to manually skip over them. This is defined + away with BB arguments. +1. LLVM has a separate function Argument node. This is defined away with BB + arguments, because the arguments to the entry block serve this purpose. +1. Blocks of PHI nodes in LLVM execute atomically, which is surprising and + super confusing to compiler engineers and it is easy to introduce bugs with + this (very related to the + "[lost copy](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.524.5461&rep=rep1&type=pdf)" + problem in SSA lowering literature.) With the BB argument representation, + this confusion is defined away. +1. The entry list of PHI nodes in LLVM are unordered, and some blocks have + thousands of predecessors (e.g. unwind blocks). This can cause long compile + time problems because transformations have to linearly scan this list. This + is defined away with BB argument representation. +1. LLVM has no way to represent values that are available only in one successor + but not the other, e.g. its invoke instruction cannot produce the exception + value JUST on the exception edge. Instead, the + [landingpad instruction](http://llvm.org/docs/LangRef.html#landingpad-instruction) + is a hack used to represent this. MLIR doesn't make use of this capability, + but SIL uses it extensively, e.g. in the + [switch_enum instruction](https://github.com/apple/swift/blob/master/docs/SIL.rst#switch-enum). 
+
+For more context, block arguments were previously used in the Swift
+[SIL Intermediate Representation](https://github.com/apple/swift/blob/master/docs/SIL.rst),
+and described in
+[a talk on YouTube](https://www.youtube.com/watch?v=Ntj8ab-5cvE). The section of
+interest
+[starts here](https://www.google.com/url?q=https://youtu.be/Ntj8ab-5cvE?t%3D596&sa=D&ust=1529450150971000&usg=AFQjCNFQHEWL7m8q3eO-1DiKw9zqC2v24Q).
+
+### Index type disallowed in vector/tensor/memref types
+
+Index types are not allowed as elements of `vector`, `tensor` or `memref` type.
+Index types are intended to be used for platform-specific "size" values and may
+appear in subscripts, sizes of aggregate types and affine expressions. They are
+also tightly coupled with `affine.apply` and affine.load/store operations;
+having `index` type is a necessary precondition for a value to be acceptable by
+these operations. While it may be useful to have `memref` to express
+indirect accesses, e.g. sparse matrix manipulations or lookup tables, it creates
+problems MLIR is not ready to address yet. MLIR needs to internally store
+constants of aggregate types and emit code operating on values of those types,
+which are subject to target-specific size and alignment constraints. Since MLIR
+does not have a target description mechanism at the moment, it cannot reliably
+emit such code. Moreover, some platforms may not support vectors of type
+equivalent to `index`.
+
+Indirect access use cases can be alternatively supported by providing an
+`index_cast` instruction that allows for conversion between `index` and
+fixed-width integer types, at the SSA value level. It has an additional benefit
+of supporting smaller integer types, e.g. `i8` or `i16`, for small indices
+instead of (presumably larger) `index` type.
+
+### Bit width of non-primitive types and `index` is undefined
+
+The bit width of a compound type is not defined by MLIR, it may be defined by a
+specific lowering pass. 
In MLIR, bit width is a property of certain primitive
+_types_, in particular integers and floats. It is equal to the number that
+appears in the type definition, e.g. the bit width of `i32` is `32`, and so is
+the bit width of `f32`. The bit width is not _necessarily_ related to the amount
+of memory (in bytes) or the size of register (in bits) that is necessary to
+store the value of the given type. These quantities are target and ABI-specific
+and should be defined during the lowering process rather than imposed from
+above. For example, `vector<3xi57>` is likely to be lowered to a vector of four
+64-bit integers, so that its storage requirement is `4 x 64 / 8 = 32` bytes,
+rather than `(3 x 57) ceildiv 8 = 22` bytes as can be naively computed from the
+bitwidth. Individual components of MLIR that allocate space for storing values
+may use the bit size as the baseline and query the target description when it is
+introduced.
+
+The bit width is not defined for dialect-specific types at the MLIR level.
+Dialects are free to define their own quantities for type sizes.
+
+### Signless types
+
+Integers in the builtin MLIR type system have a bitwidth (note that the `index`
+type has a symbolic width equal to the machine word size), but they do not have
+an intrinsic sign. This means that the "standard ops" operation set has things
+like `addi` and `muli` which do two's complement arithmetic, but some other
+operations get a sign, e.g. `divis` vs `diviu`.
+
+LLVM uses the [same design](http://llvm.org/docs/LangRef.html#integer-type),
+which was introduced in a revamp rolled out
+[in the LLVM 2.0 integer type](http://releases.llvm.org/2.0/docs/LangRef.html#t_derived).
+Prior to that, from
+[LLVM 1.0](http://releases.llvm.org/1.0/docs/LangRef.html#t_classifications) to
+[1.9](http://releases.llvm.org/1.9/docs/LangRef.html#t_classifications), LLVM
+used signed types like "sbyte" and "ubyte". This shift was important and has
+served LLVM well over the years. 
The reason this is important is that it is a +good thing for an intermediate representation to represent the same computation +with the same instruction. Signed types got in the way, because (e.g.) an "add +of an sbyte" does the same computation as an "add of a ubyte", but the type +system made them look artificially different. This split also required casts +like "cast from sbyte to ubyte" which do nothing at the machine level. Removing +signs from the type system eliminated these problems, making the compiler +simpler. + +More information about this split is available in an old +[talk on youtube](https://www.youtube.com/watch?v=VeRaLPupGks) talking about +LLVM 2.0. + +Note that this rationale only applies to the "standard ops" dialect in which we +can express an opinion about its design. Other dialects generally try to model +an external system, and should aim to reflect its design as closely as possible. + +### Splitting floating point vs integer operations + +The MLIR "standard" operation set splits many integer and floating point +operations into different categories, for example `addf` vs `addi` and `cmpf` vs +`cmpi` +([following the design of LLVM](http://llvm.org/docs/LangRef.html#binary-operations)). +These instructions _are_ polymorphic on the number of elements in the type +though, for example `addf` is used with scalar floats, vectors of floats, and +tensors of floats (LLVM does the same thing with its scalar/vector types). + +This split is important because floating point and integer operations are quite +different in practice: for example, floating point values include NaN's, so +[integer comparisons](http://llvm.org/docs/LangRef.html#icmp-instruction) and +[floating point comparisons](http://llvm.org/docs/LangRef.html#fcmp-instruction) +should use different comparison opcodes. 
On the arithmetic side of things, +floating point operations support rounding modes, floating point contractions, +["fast math"](http://llvm.org/docs/LangRef.html#fadd-instruction), and integers +may want to have two's complement overflow behavior or be undefined on +[various forms of wrapping](http://llvm.org/docs/LangRef.html#add-instruction) +for performance. + +We are a long way from this sort of thing being a priority to care about in +MLIR, but since we have experience and know the right way to do this, we'd +rather design it in from the beginning. + +Note that this rationale only applies to the "standard ops" dialect in which we +can express an opinion about its design. Other dialects generally try to model +an external system, and should aim to reflect its design as closely as possible. + +### Specifying sign in integer comparison operations + +Since integers are [signless](#signless-types), it is necessary to define the +sign for integer comparison operations. This sign indicates how to treat the +foremost bit of the integer: as sign bit or as most significant bit. For +example, comparing two `i4` values `0b1000` and `0b0010` yields different +results for unsigned (`8 > 3`) and signed (`-8 < 3`) interpretations. This +difference is only significant for _order_ comparisons, but not for _equality_ +comparisons. Indeed, for the latter all bits must have the same value +independently of the sign. Since both arguments have exactly the same bit width +and cannot be padded by this operation, it is impossible to compare two values +whose bit representations would differ while the values are interpreted as +equal. + +### Specifying comparison kind as attribute + +Unlike arithmetic, comparison operators share several common properties, e.g. +they cannot be considered associative. In practice, comparisons are sometimes +implemented by the same instruction or its variants so it makes sense to group +them together at the IR level. 
+
+An alternative would be introducing ten distinct operators for all currently
+supported kinds of integer comparisons. These operators would have increased the
+number of "reserved" names used by standard operations as well as the size of
+the C++ API while their implementations would have been mostly identical.
+
+The comparison kind is internally an integer attribute. However, for the sake of
+readability by humans, custom assembly form accepts string literals that are
+mapped to the underlying integer values: `cmpi "eq", %lhs, %rhs` better implies
+integer equality comparison than `cmpi 0, %lhs, %rhs` where it is unclear what
+gets compared to what else. This syntactic sugar is possible thanks to parser
+logic redefinitions for custom assembly form of non-builtin operations.
+Supporting it in the full notation would have required changing how the main
+parsing algorithm works and may have unexpected repercussions. While it had been
+possible to store the predicate as a string attribute, it would have rendered it
+impossible to implement switching logic based on the comparison kind and made
+attribute validity checks (one out of ten possible kinds) more complex.
+
+### 'select' operation to implement min/max
+
+Although `min` and `max` operations are likely to occur as a result of
+transforming affine loops in ML functions, we did not make them first-class
+operations. Instead, we provide the `select` operation that can be combined with
+`cmpi` to implement the minimum and maximum computation. Although they now
+require two operations, they are likely to be emitted automatically during the
+transformation inside MLIR. 
On the other hand, there are multiple benefits of +introducing `select`: standalone min/max would concern themselves with the +signedness of the comparison, already taken into account by `cmpi`; `select` can +support floats transparently if used after a float-comparison operation; the +lower-level targets provide `select`-like instructions making the translation +trivial. + +This operation could have been implemented with additional control flow: `%r = +select %cond, %t, %f` is equivalent to + +```mlir +^bb0: + cond_br %cond, ^bb1(%t), ^bb1(%f) +^bb1(%r): +``` + +However, this control flow granularity is not available in the ML functions +where min/max, and thus `select`, are likely to appear. In addition, simpler +control flow may be beneficial for optimization in general. + +### Regions + +#### Attributes of type 'Block' + +We considered representing regions through `ArrayAttr`s containing a list of a +special type `IRBlockAttr`, which in turn would contain a list of operations. +All attributes in MLIR are unique’d within the context, which would make the IR +inside the regions immortal for no good reason. + +#### Use "inlined" functions as regions + +We considered attaching a "force-inline" attribute on a function and/or a +function `call` operation. Even the minimal region support (use cases in +affine.for and affine.if existing before the regions) requires access to the +values defined in the dominating block, which is not supported by functions. +Conceptually, function bodies are instances of regions rather than the inverse; +regions can also be device kernels, alternative sections, etc. + +#### Dedicated `region` operation + +This would mean we have a special kind of operation that is allowed to have +regions while other operations are not. Such distinction is similar to the +Stmt/Op difference we have had and chose to remove to make the IR simpler and +more flexible. 
It would also require analyses and passes to consider the +interplay between operations (e.g., an `affine.for` operation must be followed +by a region operation). Finally, a region operation can be introduced using the +current implementation, among other operations and without being special in any +sense. + +#### Explicit capture of the values used in a region + +Being able to use values defined outside the region implies that use-def chains +may contain uses from different nested regions. Consequently, IR transformations +and analyses can pull the instruction defining the value across region +boundaries, for example in case of TableGen-defined canonicalization patterns. +This would not be the case if all used values had been passed as region +arguments. One of the motivations for introducing regions in the IR is precisely +to enable cross-region analyses and transformations that are simpler than +inter-procedural transformations. Having uses from different regions appear in +the same use-def chain, contrary to an additional data structure maintaining +correspondence between function call arguments as uses of the original +definitions and formal arguments as new definitions, enables such +simplification. Since individual operations now belong to blocks, which belong +to regions, it is always possible to check if the definition of the value +belongs to the same region as its particular use. The risk is that any IR +traversal will need to handle explicitly this situation and it is easy to forget +a check (or conversely it isn’t easy to design the right check in a tablegen +pattern for example): traversing use-def chains potentially crosses implicitly +semantic barriers, making it possible to unknowingly break region semantics. +This is expected to be caught in the verifier after the transformation. + +At the same time, one may choose to pass certain or all values as region +arguments to explicitly break the use-def chains in the current proposal. 
This +can be combined with an attribute-imposed semantic requirement disallowing the +body of the region to refer to any value from outside it. + +### Quantized integer operations + +We haven't designed integer quantized operations in MLIR, but experience from +TensorFlow suggests that it is better to put information about the quantization +range/scale into the type itself, rather than have a single type like "qint8" +and put these on attributes of the operation. + +There are a few ways to do this with MLIR, including at least: + +* We could do the same thing TensorFlow does - and we will _have_ to support + that model to some extent for compatibility. +* We can encode the fp range of quantized integers directly into the types + when they are constants. The best practice on this seems to be to encode the + zero point as well as a scale factor. This ensures that 0.0 is always + exactly representable, e.g. `qi8<-1.42, 31.23x>`. +* We could theoretically encode dynamically determined ranges into the types + using something like `qi8` with the bounds being determined through the + SSA dataflow graph dynamically - similar to how dynamic shapes are handled. + +We will definitely need to do #1 for compatibility, we probably want to do #2, +and we should investigate #3 over time. That said, our short term plan is to get +more implementation experience with the rest of the system first, then come back +to re-examine the representation for quantized arithmetic when we have that +experience. When we do, we should chat with benoitjacob@ and +[read the paper](https://arxiv.org/abs/1712.05877). + +### Dialect type extensions + +This section describes the design decisions that shaped the dialect extensible +type system present in MLIR. + +#### Reserving dialect type kinds + +Dialects that wish to define type extensions must reserve a range of type kinds +within a '.def' file within the core IR library. 
This means that every dialect
+wishing to define custom types must modify this file, but it guarantees that all
+type casting checks are performed in O(1) time.
+
+#### Interactions between dialects
+
+There are two different interactions between dialects that are important to
+understand. When types of a dialect are:
+
+* In operations of other dialects
+
+    - For standard/builtin operations, only standard/builtin types are
+        allowed. This restriction allows for operations to clearly understand
+        the invariants that they are working under.
+    - Outside of standard/builtin operations, dialects are expected to verify
+        the allowable operation types per operation.
+
+* In types of other dialects
+
+    - For standard/builtin types, these types are allowed to contain types
+        from other dialects. This simplifies the type system and removes the
+        need for dialects to redefine all of the standard aggregate types, e.g.
+        tensor, as well as the memref type. Dialects are expected to verify that
+        a specific type is valid within a standard type, e.g. if a type can be
+        an element of a tensor.
+    - For dialect types, the dialect is expected to verify any type
+        invariants, e.g. if the standard tensor type can contain a specific type
+        of that dialect.
+
+#### Separating builtin and standard types
+
+Following the separation between the built-in and standard dialect, it makes
+sense to separate built-in types and standard dialect types. Built-in types are
+required for the validity of the IR itself, e.g. the function type (which
+appears in function signatures and generic assembly forms of operations).
+Integer, float, vector, memref and tensor types, while important, are not
+necessary for IR validity.
+
+#### Unregistered types
+
+MLIR supports unregistered operations in generic assembly form. MLIR also
+supports a similar concept for types. When parsing, if the dialect for the
+dialect type has not been registered the type is modeled as an 'OpaqueType'. 
This allows
+for types to be round-tripped without needing to link in the dialect library
+that defined them. No additional information about opaque types, outside of
+parsing/printing, will be available.
+
+#### Dialect type syntax
+
+Dialect extended types are represented as string literals wrapped inside of the
+dialect namespace. This means that the parser delegates to the dialect for
+parsing specific type instances. This differs from the representation of dialect
+defined operations, which have an identifier name that the parser uses to
+identify and parse them.
+
+This representation was chosen for several reasons:
+
+##### Dialects must provide custom type parsers
+
+Dialect type parsing cannot plug into the existing parser infrastructure as
+operations do with the OpAsmParser/Printer. Operations have a defined syntax
+structure that is the same across all dialects. Types, on the other hand, may
+have many different, and sometimes conflicting, parsing constraints that would
+be difficult/unmaintainable to provide within a single interface.
+
+This also has the added benefit of encouraging dialects to reuse existing
+external type parsers. For example, an LLVM dialect may provide an MLIR LLVM
+type that is simply a wrapper around LLVM types. The LLVM dialect would then use
+the existing LLVM type parsing infrastructure.
+
+Example:
+
+```mlir
+%s = "foo"() : () -> !llvm<"i32*">
+```
+
+##### Types do not always have canonical names
+
+Unlike operations, types generally do not have a formal canonical name. For
+example, function types have no defined keyword and integer types are defined by
+a regular expression to support arbitrary bitwidth. Dialects with existing type
+systems, e.g. LLVM, are likely to provide wrappers around their existing type
+systems. For these wrapper types there is no simple canonical name, it's logical
+to think of these types as existing within the namespace of the dialect. 
If a +dialect wishes to assign a canonical name to a type, it can be done via +[type aliases](LangRef.md#type-aliases). + +### Tuple types + +The MLIR type system provides first class support for defining +[tuple types](LangRef.md#tuple-type). This is due to the fact that `Tuple` +represents a universal concept that is likely to, and has already begun to, +present itself in many different dialects. Though this type is first class in +the type system, it merely serves to provide a common mechanism in which to +represent this concept in MLIR. As such, MLIR provides no standard operations +for interfacing with `tuple` types. It is up to dialect authors to provide +operations, e.g. extract_tuple_element, to interpret and manipulate them. When +possible, operations should prefer to use multiple results instead. These +provide a myriad of benefits, such as alleviating any need for tuple-extract +operations that merely get in the way of analysis and transformation. + +### Assembly forms + +MLIR decides to support both generic and custom assembly forms under the +following considerations: + +MLIR is an open system; it is designed to support modular and pluggable +dialects. Depending on whether there exists a corresponding dialect and whether +the dialect is plugged in, operations may or may not be registered into MLIR +system. Yet we still need a way to investigate these operations. So the generic +assembly form is mandated by this aspect of MLIR system. It provides a default +textual form for operations. + +On the other hand, an assembly form is for assisting developers to investigate +the IR. The generic form serves as a safe fallback but it can be too verbose for +certain ops. Therefore, MLIR gives each dialect the choice to define a custom +assembly form for each operation according to the operation's semantics and +specific needs. 
The custom assembly form can de-duplicate information from the
+operation to derive a more concise form, thus better facilitating the
+comprehension of the IR.
+
+## Examples
+
+This section describes a few very simple examples that help understand how MLIR
+represents computation.
+
+### Non-affine control flow
+
+```c
+// A simple linear search in every row of a matrix
+for (i = 0; i < N; i++) {
+  for (j = 0; j < N; j++) {
+    // dynamic control flow
+    if (a[i][j] == key) {
+      s[i] = j;
+      break;
+    }
+  }
+}
+```
+
+The presence of dynamic control flow leads to an inner non-affine function
+nested in an outer function that uses affine loops.
+
+```mlir
+func @search(%A: memref, %key : i32) {
+  %ni = dim %A, 0 : memref
+  // This loop can be parallelized
+  affine.for %i = 0 to %ni {
+    call @search_body (%A, %S, %key, %i) : (memref, memref, i32, i32)
+  }
+  return
+}
+
+func @search_body(%A: memref, %S: memref, %key: i32, %i : i32) {
+  %nj = dim %A, 1 : memref
+  br ^bb1(0)
+
+^bb1(%j: i32)
+  %p1 = cmpi "lt", %j, %nj : i32
+  cond_br %p1, ^bb2, ^bb5
+
+^bb2:
+  %v = affine.load %A[%i, %j] : memref
+  %p2 = cmpi "eq", %v, %key : i32
+  cond_br %p2, ^bb3(%j), ^bb4
+
+^bb3(%j: i32)
+  affine.store %j, %S[%i] : memref
+  br ^bb5
+
+^bb4:
+  %jinc = addi %j, 1 : i32
+  br ^bb1(%jinc)
+
+^bb5:
+  return
+}
+```
+
+As per the [MLIR spec](LangRef.md), the restrictions on dimensions and symbol
+identifiers to be used with the affine.apply operation only apply to accesses
+inside `affine.for` and `affine.if` operations. However, an analysis of accesses
+inside the called function (`@search_body`) is necessary to determine if the
+`%i` loop could be parallelized: such function access analysis is calling
+context sensitive.
+
+### Non-affine loop bounds
+
+Loop bounds that are not affine lead to a nesting of functions as shown below.
+
+```c
+for (i = 0; i < N; i++)
+  for (j = 0; j < N; j++)
+    // Non-affine loop bound for k loop.
+    for (k = 0; k < pow(2, j); k++) +       for (l = 0; l < N; l++) { +        // block loop body +        ... +       } +``` + +```mlir +func @outer_nest(%n : index) { + affine.for %i = 0 to %n { + affine.for %j = 0 to %n { + %pow = call @pow(2, %j) : (index, index) -> index + call @inner_nest(%pow, %n) : ... + } + } + return +} + +func @inner_nest(%m : index, %n : index) { + affine.for %k = 0 to %m { + affine.for %l = 0 to %n { + ... + } + } + return +} +``` + +### Reference 2D Convolution + +The following example illustrates a reference implementation of a 2D +convolution, which uses an integer set `#domain` to represent valid input data +in a dilated convolution. + +```mlir +// Dilation factors S0 and S1 can be constant folded if constant at compile time. +#domain = (d0, d1)[S0,S1,S2,S3]: (d0 % S0 == 0, d1 % S1 == 0, d0 >= 0, d1 >= 0, + S3 - d0 - 1 >= 0, S4 - d1 - 1 >= 0) +// Identity map (shown here for illustration). +#map0 = (d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6) + +// Affine map from output to input coordinate space. +// d0 = output_h, d1 = output_w, d2 = kernel_h, d3 = kernel_w +// S0 = h_stride, S1 = w_stride, S2 = h_kernel_dilation, S3 = w_kernel_dilation +// S4 = h_pad_low, S5 = w_pad_low +// %out0 = %0#1 * %h_stride + %0#4 * %h_kernel_dilation - %h_pad_low +// %out1= %0#2 * %w_stride + %0#5 * %w_kernel_dilation - %w_pad_low +#map1_0 = (d0, d1, d2, d3) [S0, S1, S2, S3, S4, S5] -> (d0 * S0 + d2 * S2 - %S4) +#map1_1 = (d0, d1, d2, d3) [S0, S1, S2, S3, S4, S5] -> (d1 * S1 + d3 * S3 - %S5) + +// Semi-affine map to undilated input coordinate space. +// d0 = input_h, d1 = input_w, S0 = h_base_dilation, S1 = w_base_dilation. 
+#map2_0 = (d0, d1) [S0, S1] -> (d0 / S0)
+#map2_1 = (d0, d1) [S0, S1] -> (d1 / S1)
+
+// Conv2D shapes:
+// input: [batch, input_height, input_width, input_feature]
+// kernel: [kernel_height, kernel_width, input_feature, output_feature]
+// output: [batch, output_height, output_width, output_feature]
+func @conv2d(%input: memref<16x1024x1024x3xf32, #lm0, /*scratchpad=*/1>,
+             %kernel: memref<5x5x3x32xf32, #lm0, /*scratchpad=*/1>,
+             %output: memref<16x512x512x32xf32, #lm0, /*scratchpad=*/1>) {
+  affine.for %b = 0 to %batch {
+    affine.for %oh = 0 to %output_height {
+      affine.for %ow = 0 to %output_width {
+        affine.for %of = 0 to %output_feature {
+          affine.for %kh = 0 to %kernel_height {
+            affine.for %kw = 0 to %kernel_width {
+              affine.for %if = 0 to %input_feature {
+                // Calculate input indices.
+                %1_0 = affine.apply #map1_0 (%0#1, %0#2, %0#4, %0#5)
+                  [%h_stride, %w_stride, %h_kernel_dilation, %w_kernel_dilation,
+                   %h_pad_low, %w_pad_low]
+                %1_1 = affine.apply #map1_1 (%0#1, %0#2, %0#4, %0#5)
+                  [%h_stride, %w_stride, %h_kernel_dilation, %w_kernel_dilation,
+                   %h_pad_low, %w_pad_low]
+
+                // Check if access is not in padding.
+                affine.if #domain(%1_0, %1_1)
+                  [%h_base_dilation, %w_kernel_dilation, %h_bound, %w_bound] {
+                  %2_0 = affine.apply #map2_0 (%1_0, %1_1)
+                  %2_1 = affine.apply #map2_1 (%1_0, %1_1)
+                  // Compute: output[output_indices] += input[input_indices] * kernel[kernel_indices]
+                  call @multiply_accumulate(%input, %kernel, %output, %b, %oh, %ow, %of, %kh, %kw, %if, %2_0, %2_1)
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return
+}
+```
+
+TODO (Add more examples showing the IR for a variety of interesting cases)
+
+## Design alternatives and extensions
+
+This is a list of some design alternatives and extensions that we discussed in
+detail but did not include in the spec or postponed them for future
+consideration on demand. 
We will revisit these discussions when we have more +implementation experience and learn more about the challenges and limitations of +our current design in practice. + +### Polyhedral code representation alternatives: schedule lists vs schedules trees vs affine loop/if forms + +The current MLIR uses a representation of polyhedral schedules using a tree of +if/for loops. We extensively debated the tradeoffs involved in the typical +unordered polyhedral instruction representation (where each instruction has +multidimensional schedule information), discussed the benefits of schedule tree +forms, and eventually decided to go with a syntactic tree of affine if/else +conditionals and affine for loops. Discussion of the tradeoff was captured in +this document: +[ MLIR: The case for a simplified polyhedral form](RationaleSimplifiedPolyhedralForm.md). + +At a high level, we have two alternatives here: + +1. Schedule tree representation instead of an affine loop AST form: The current + proposal uses an affine loop and conditional tree form, which is syntactic + and with no separation of domains as sets and schedules as multidimensional + affine functions. A schedule tree form however makes polyhedral domains and + schedules a first class concept in the IR allowing compact expression of + transformations through the schedule tree without changing the domains of + instructions. Such a representation also hides prologues, epilogues, partial + tiles, complex loop bounds and conditionals making loop nests free of + "syntax". Cost models instead look at domains and schedules. In addition, if + necessary such a domain schedule representation can be normalized to + explicitly propagate the schedule into domains and model all the cleanup + code. An example and more detail on the schedule tree form is in the next + section. +1. Having two different forms of "affine regions": an affine loop tree form + and a polyhedral schedule tree form. 
In the latter, ops could carry + attributes capturing domain, scheduling, and other polyhedral code + generation options with IntegerSet, AffineMap, and other attributes. + +#### Schedule Tree Representation for Affine Regions + +This representation is based on a simplified form of the domain/schedule +representation used by the polyhedral compiler community. Domains represent what +has to be executed while schedules represent the order in which domain elements +are interleaved. We model domains as non-piece-wise convex integer sets, and +schedules as affine functions; however, the former can be disjunctive, and the +latter can be piece-wise affine relations. In the schedule tree representation, +domain and schedules for instructions are represented in a tree-like structure +which is called a schedule tree. Each non-leaf node of the tree is an abstract +polyhedral dimension corresponding to an abstract fused loop for each ML +instruction that appears in that branch. Each leaf node is an ML Instruction. + +```mlir +// A tiled matmul code (128x128x128) represented in schedule tree form + +// #map0 = (d0, d1, d2, d3, d4, d5) -> (128*d0 + d3, 128*d1 + d4, 128*d2 + d5) +#intset_ij = (i, j) [M, N, K] : i >= 0, -i + N - 1 >= 0, j >= 0, -j + N-1 >= 0 +#intset_ijk = (i, j, k) [M, N, K] : i >= 0, -i + N - 1 >= 0, j >= 0, + -j + M-1 >= 0, k >= 0, -k + N - 1 >= 0) +func @matmul(%A, %B, %C, %M, %N, %K) : (...) { // %M, N, K are symbols + // t1, t2, t3, t4, t5, t6 are abstract polyhedral loops + mldim %t1 : {S1,S2,S3,S4,S5} floordiv (i, 128) { + mldim %t2 : {S1,S2,S3,S4,S5} floordiv (j, 128) { + // (%i, %j) = affine.apply (d0, d1) -> (128*d0, 128*d1) (%t1, %t2) + call dma_mem_to_scratchpad(%C, %i, %j, %M, %N, %K) + with @intset_ij(%i, %j) [%M, %N, %K] + mldim %t3 : {S2,S3,S4,S5} floordiv (k, 128) { + // (%i, %j, %k) = affine.apply (d0, d1, d2) + // -> (128*d0, 128*d1, 128*d2) (%t1, %t2, %t3) + call dma_mem_to_scratchpad(%A, ...) 
with #inset_ijk (%i, %j, %k) [%M, %N, %K] + // (%i, %j, %k) = affine.apply (d0, d1, d2) + // -> (128*d0, 128*d1, 128*d2) (%t1, %t2, %t3) + call dma_mem_to_scratchpad(%B, ...) with #inset_ijk (%i, %j, %k) [%M, %N, %K] + mldim %t4 : {S4} i mod 128 { + mldim %t5 : {S4} j mod 128 { + mldim %t6 : {S4} k mod 128 { + // (%i, %j, %k) = affine.apply #map0 (%t1, %t2, %t3, %t4, %t5, %t6) + call matmul_body(A, B, C, %i, %j, %k, %M, %N, %K) + with #inset_ijk(%i, %j, %k) [%M, %N, %K] + } // end mld4im t6 + } // end mldim t5 + } // end mldim t4 + } // end mldim t3 + // (%i, %j) = affine.apply (d0, d1) -> (128*d0, 128*d1) (%t1, %t2) + call $dma_scratchpad_to_mem_C ... with #intset(%i, %j) [%M, %N, %K] + } // end mldim t2 + } // end mldim t1 + return +} + +``` + +### Affine Relations + +The current MLIR spec includes affine maps and integer sets, but not affine +relations. Affine relations are a natural way to model read and write access +information, which can be very useful to capture the behavior of opaque external +library calls, high-performance vendor libraries, or user-provided / user-tuned +routines. + +An affine relation is a relation between input and output dimension identifiers +while being symbolic on a list of symbolic identifiers and with affine +constraints on the identifiers. + +Syntax: + +``` +// Affine relation definition at the top of file +affine-rel-def ::= affine-rel-id `=` affine-relation-inline + +affine-rel-id ::= `##` prefixed-id + +affine-relation-inline ::= + `(` input-dims `)` (`[` symbols `]`)? `->` + `(` output-dims `)` : affine-constraint-conjunction + +input-dims ::= bare-id-list +output-dims ::= bare-id-list +symbols ::= bare-id-list + +affine-rel ::= affine-rel-id | affine-relation-inline + +// Usage +affine-rel-spec ::= affine-rel dim-and-symbol-use-list +``` + +All identifiers appearing in input-dims, output-dims, and symbol-dims are +pairwise distinct. 
All affine-constraint non-terminals in the above syntax are +allowed to contain identifiers only from input-dims, output-dims, and +symbol-dims. + +Affine relations are used to model read, write, may_read, and may_write sets of +functions in the IR. The output dimension identifiers correspond to the data +dimensions. + +Example: + +```mlir +// read relation: two elements ( d0 <= r0 <= d0+1 ) +##aff_rel9 = (d0) -> (r0) : r0 - d0 >= 0, d0 - r0 + 1 >= 0 + +func @count (%A : memref<128xf32>, %pos : i32) -> f32 + reads: {%A ##aff_rel9 (%pos)} + writes: /* empty */ + may_reads: /* empty */ + may_writes: /* empty */ { +bb0 (%0, %1: memref<128xf32>, i64): + %val = affine.load %A [%pos] + %val = affine.load %A [%pos + 1] + %p = mulf %val, %val : f32 + return %p : f32 +} +``` + +### Regions + +#### Making function definition an operation + +MLIR supports values of a Function type. Instead of having first-class IR +concept for functions, one could define an operation with a body region that +defines a function value. The particularity of functions is that their names are +globally visible and can be referred to before being defined, unlike SSA values +that must be defined first. Implementing a "function definition" operation would +require to relax some of the SSA constraints in a region, and also make the IR +Module a region as well. It would also affect the core infrastructure (e.g., +function passes) only for the sake of concept unification. + +#### Having types on a region + +Instead of inspecting the types of arguments of the first block, one could give +the region itself a type. This type would be redundant with block argument +types, which must have values and create room for type mismatches. While +functions do have types that are partly redundant with the arguments of the +first block in the function, this is necessary to support function declarations +that do not have a body which we can refer to in order to obtain the argument +types. 
A region is always contained in an operation or a function that can be +queried to obtain the “type” of the region if necessary. + +A type on a region can be justified if Regions were to be considered separately +from the enclosing entity (operation or function) and had their own semantics +that should be checked. + +#### Attaching attributes to regions + +Regions could be annotated with dialect attributes to use attribute verification +hooks. An operation could take multiple regions as arguments, and each of them +may require different attributes. However, there are currently very few +practical cases where this would be necessary. Instead, one could simulate +per-region attributes with array attributes attached to the entity containing +the region (operation or function). This decreases the overall complexity of the +IR and enables more concise and op-specific forms, e.g., when all regions of an +op have the same attribute that can be only mentioned once. Since the semantics +of the region is entirely defined by the enclosing entity, it also makes sense +to have attributes attached to that entity rather than to the region itself. + +This can be reconsidered in the future if we see a non-neglectable amount of use +cases. + +### Read/Write/May_Read/May_Write sets for External Functions + +Having read, write, may_read, and may_write sets for external functions which +include opaque ones, high-performance vendor libraries such as CuDNN, CuB, MKL, +FFT libraries, user-provided/optimized functions, or data movement runtimes such +as DMA ones is a powerful feature. It allows the compiler to perform analysis, +composition/transformation in the presence of such calls and with loops around +such calls on sub-tensors. For user-provided or custom hand-tuned functions, the +read/write/may_read/may_write sets could be provided a-priori by a user as part +of the external function signature or they could be part of a database. 
+ +TODO: Design this, and update to use function attribute syntax. + +Example: + +```mlir +##rel9 ( ) [s0] -> (r0, r1) : 0 <= r0 <= 1023, 0 <= r1 <= s0 - 1 + +func @cblas_reduce_ffi(%M: memref<1024 x ? x f32, #layout_map0, /*mem=*/0>) + -> f32 [ + reads: {%M, ##rel9() } + writes: /* empty */ + may_reads: /* empty */ + may_writes: /* empty */ +] + +func @dma_mem_to_scratchpad(%a : memref<1024 x f32, #layout_map0, /*mem=*/0>, + %b : memref<1024 x f32, #layout_map0, 1>, %c : memref<1024 x f32, + #layout_map0>) [ + reads: {%M, ##rel9() } + writes: /* empty */ + may_reads: /* empty */ + may_writes: /* empty */ + ] + +``` + +### Memref Extensions + +1. Arbitrary polyhedral shapes for tensors: e.g., triangular shapes in tensor + dimensions where there is symmetry: use integer set (affine constraints) to + model tensor data space (instead of just extents). Requires some changes to + the IR and the in-memory form. +1. Layout maps + + 1. Allow piece-wise affine maps for layouts: allows clean modeling of + boundary cases for images/tensors through padding, wrapping, mirroring, + padding where padded values are the results of computation as opposed to + data, padding in the interior as opposed to just boundaries. + 1. Allow many-to-one layout maps: Index and layout maps in the current + proposal are bijective. Extending them to many-to-one layout maps allows + cleaner(?) modeling of broadcast/reduce style computations while reusing + memory. + + Proposal 2(a) requires non-trivial changes to the IR and the in-memory + representation. 2(b) requires no change, but impacts how cost models look at + index and layout maps. + +### `affine.if` and `affine.for` Extensions for "Escaping Scalars" + +We considered providing a representation for SSA values that are live out of +`if/else` conditional bodies and loop carried in `affine.for` loops. We +ultimately abandoned this approach due to its complexity. 
In the current design +of MLIR, scalar variables cannot escape for loops or if instructions. In +situations, where escaping is necessary, we use zero-dimensional tensors and +memrefs instead of scalars. + +**TODO**: This whole section is obsolete and should be updated to use block +arguments and a yield like terminator in for/if instructions. + +The abandoned design of supporting escaping scalars is as follows: + +#### affine.for Instruction + +Syntax: + +``` +[ =] +for % = ... step + [with ] { } +``` + +out-var-list is a comma separated list of SSA values defined in the loop body +and used outside the loop body. in-var-list is a comma separated list of SSA +values used inside the loop body and their initializers. loop-instruction-list +is a list of instructions that may also include a yield instruction. + +Example: + +```mlir +// Return sum of elements in 1-dimensional mref A +func i32 @sum(%A : memref, %N : i32) -> (i32) { + %init = 0 + %result = affine.for %i = 0 to N with %tmp(%init) { + %value = affine.load %A[%i] + %sum = %value + %tmp + yield %sum + } + return %result : i32 +} +``` + +#### affine.if/else Instruction + +Syntax: + +``` + = affine.if () {...} [else {...}] +``` + +Out-var-list is a list of SSA values defined by the if-instruction. The values +are arguments to the yield-instruction that occurs in both then and else clauses +when else clause is present. When if instruction contains only if clause, the +escaping value defined in the then clause should be merged with the value the +variable had before the if instruction. The design captured here does not handle +this situation. + +Example: + +```mlir +// Compute sum of half of the array +func i32 @sum_half(%A : memref, %N : i32) -> (i32) { + %s0 = 0 + %s1 = affine.for %i = 1 ... 
N step 1 with %s2 (%s0) { + %s3 = if (%i >= %N / 2) { + %v0 = affine.load %A[%i] + %s4 = %s2 + %v0 + yield %s4 + } + yield %s3 + } + return %s1 : i32 +} +``` + +### Multithreading the compiler + +People want compilers to go fast, and one simple way to do that is to +multi-thread them. There are multiple strategies for this, but a simple one is +to optimize and compile separate functions in parallel. LLVM's original pass +manager anticipated this demand, and the CallGraphSCCPass manager is even +designed to support this as well, but unfortunately, a few early design +decisions in LLVM prevent this from ever happening. Instead, things like ThinLTO +are forced to split programs into separate LLVM modules/context and optimize +those chunks independently. + +The problem is that LLVM has several objects in its IR that are globally uniqued +and also mutable: notably constants like `i32 0`. In LLVM, these constants are +`Value`'s, which allow them to be used as operands to instructions, and that +they also have SSA use lists. Because these things are uniqued, every `i32 0` in +any function shares a use list. This means that optimizing multiple functions in +parallel won't work (at least without some sort of synchronization on the use +lists, which would be unbearably inefficient). + +MLIR now supports a multithreaded pass manager. We do this through several +design choices: + +1. MLIR makes use of extensive uniqued immutable data structures (affine + expressions, types, etc are all immutable, uniqued, and immortal). +2. Constants are defined in per-function pools, instead of being globally + uniqued. +3. Functions themselves are not SSA values either, so they don't have the same + problem as constants. +4. FunctionPasses are copied (through their copy ctor) into one instance per + thread, avoiding sharing of local state across threads. + +This allows MLIR function passes to support efficient multithreaded compilation +and code generation. 
diff --git a/mlir/docs/RationaleSimplifiedPolyhedralForm.md b/mlir/docs/RationaleSimplifiedPolyhedralForm.md new file mode 100644 index 0000000000000000000000000000000000000000..ec2ecc9fe502a1cd64d4d8bca241b6d120e35b89 --- /dev/null +++ b/mlir/docs/RationaleSimplifiedPolyhedralForm.md @@ -0,0 +1,415 @@ +# MLIR: The case for a simplified polyhedral form + +MLIR embraces polyhedral compiler techniques for their many advantages +representing and transforming dense numerical kernels, but it uses a form that +differs significantly from other polyhedral frameworks. + +**Disclaimer / Warning** + +This document is a very early design proposal (which has since been accepted) +that explored the tradeoffs of using this simplified form vs the traditional +polyhedral schedule list form. At some point, this document could be dusted off +and written as a proper academic paper, but until now, it is better to included +it in this crafty form than not to. Beware that this document uses archaic +syntax and should not be considered a canonical reference to modern MLIR. + +## Introduction + +This document discusses general goals of the project, introduces context and the +two alternatives, then talks about the tradeoffs of these designs. Written by +Chris Lattner. + +## General goals of an IR, and goals of mlfunc's specifically + +Our currently planned representation for MLIR consists of two kinds of +functions: an LLVM-like "CFG Function" and an "ML Function": a function +represented in multidimensional loop form. The idea is that a CFG function is +capable of full generality for expressing arbitrary computation, but is awkward +for loop transformations. In contrast, mlfunc's are limited (e.g. to control +flow involving loop nests over affine spaces) but these limitations make it much +easier to transform and analyze, particularly for the set of computations in a +machine learning kernel. 
The design of an intermediate representation is an optimization problem
consider C-like code like: + +```c + void simple_example(...) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + float tmp = X[i,j] // S1 + A[i,j] = tmp + 1 // S2 + B[i,j] = tmp * 42 // S3 + } + } + } +``` + +The polyhedral representation doesn't care about the actual computation, so we +will abstract them into S1/S2/S3 in the discussion below. Originally, we planned +to represent this with a classical form like (syntax details are not important +and probably slightly incorrect below): + +``` + mlfunc @simple_example(... %N) { + %tmp = call @S1(%X, %i, %j) + domain: (0 <= %i < %N), (0 <= %j < %N) + schedule: (i, j, 0) + + call @S2(%tmp, %A, %i, %j) + domain: (0 <= %i < %N), (0 <= %j < %N) + schedule: (i, j, 1) + + call @S3(%tmp, %B, %i, %j) + domain: (0 <= %i < %N), (0 <= %j < %N) + schedule: (i, j, 2) + } +``` + +In this design, an mlfunc is an unordered bag of instructions whose execution +order is fully controlled by their schedule. + +However, we recently agreed that a more explicit schedule tree representation is +a better fit for our needs, because it exposes important structure that will +make analyses and optimizations more efficient, and also makes the scoping of +SSA values more explicit. This leads us to a representation along the lines of: + +``` + mlfunc @simple_example(... %N) { + d0/d1 = mlspace + for S1(d0), S2(d0), S3(d0) { + for S1(d1), S2(d1), S3(d1) { + + %tmp = call @S1(%X, d0, d1) ;; S1 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + + call @S2(%tmp, %A, d0, d1) ;; S2 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + + call @S3(%tmp, %B, d0, d1) ;; S3 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + } + } + } +``` + +This change makes the nesting structure of the loops an explicit part of the +representation, and makes lexical ordering within a loop significant +(eliminating the constant 0/1/2 of schedules). 
+ +It isn't obvious in the example above, but the representation allows for some +interesting features, including the ability for instructions within a loop nest +to have non-equal domains, like this - the second instruction ignores the outer +10 points inside the loop: + +``` + mlfunc @reduced_domain_example(... %N) { + d0/d1 = mlspace + for S1(d0), S2(d0) { + for S1(d1), S2(d1) { + %tmp = call @S1(%X, d0, d1) ;; S1 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + + call @S2(%tmp, %A, d0, d1) ;; S2 + domain: (10 <= d0 < %N-10), (10 <= d1 < %N-10) + } + } + } +``` + +It also allows schedule remapping within the instruction, like this example that +introduces a diagonal skew through a simple change to the schedules of the two +instructions: + +``` + mlfunc @skewed_domain_example(... %N) { + d0/d1 = mlspace + for S1(d0), S2(d0+d1) { + for S1(d0+d1), S2(d1) { + %tmp = call @S1(%X, d0, d1) ;; S1 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + + call @S2(%tmp, %A, d0, d1) ;; S2 + domain: (0 <= d0 < %N), (0 <= d1 < %N) + } + } + } +``` + +This form has great power, and the polyhedral code generator (which lowers from +an mlfunc to a cfgfunc representation) handles this power so things that +introduce loop transformations don't have to explicitly manipulate the looping +structure. + +## Proposal: Simplified Polyhedral Form + +This document proposes and explores the idea of going one step further, moving +all of the domain and schedule information into the "schedule tree". In this +form, we would have a representation where all instructions inside of a given +for-loop are known to have the same domain, which is maintained by the loop. In +the simplified form, we also have an "if" instruction that takes an affine +condition. + +Our simple example above would be represented as: + +```mlir + mlfunc @simple_example(... %N) { + affine.for %i = 0 ... %N step 1 { + affine.for %j = 0 ... %N step 1 { + // identity noop in this case, but can exist in general. 
+ %0,%1 = affine.apply #57(%i, %j) + + %tmp = call @S1(%X, %0, %1) + + call @S2(%tmp, %A, %0, %1) + + call @S3(%tmp, %B, %0, %1) + } + } + } +``` + +The example with the reduced domain would be represented with an if instruction: + +```mlir + mlfunc @reduced_domain_example(... %N) { + affine.for %i = 0 ... %N step 1 { + affine.for %j = 0 ... %N step 1 { + // identity noop in this case, but can exist in general. + %0,%1 = affinecall #57(%i, %j) + + %tmp = call @S1(%X, %0, %1) + + if (10 <= %i < %N-10), (10 <= %j < %N-10) { + + %2,%3 = affine.apply(%i, %j) // identity noop in this case + + call @S2(%tmp, %A, %2, %3) + } + } + } + } +``` + +These IRs represent exactly the same information, and use a similar information +density. The 'traditional' form introduces an extra level of abstraction +(schedules and domains) that make it easy to transform instructions at the +expense of making it difficult to reason about how those instructions will come +out after code generation. With the simplified form, transformations have to do +parts of code generation inline with their transformation: instead of simply +changing a schedule to **(i+j, j)** to get skewing, you'd have to generate this +code explicitly (potentially implemented by making polyhedral codegen a library +that transformations call into): + +```mlir +mlfunc @skewed_domain_example(... %N) { + affine.for %t1 = 0 ... 2*N-2 step 1 { + affine.for %t2 = max(0, t1-N+1) ... min(N, t1) step 1 { + (%i, %j) = (%t1-%t2, %t2) + ... + } + } +} +``` + +## Evaluation + +Both of these forms are capable of expressing the same class of computation: +multidimensional loop nests with affine loop bounds and affine memory +references. That said, they pose very different tradeoffs in other ways. + +### Commonality: can express same computation + +Both of these can express the same sorts of computation, e.g. kernels written in +one form are representable in the other form in all cases. 
+ +### Commonality: dependence analysis + +These representations both use affine functions for data layout mapping and +access subscripts, and dependence analysis works the same way. + +### Commonality: difficulty of determining optimal transformation series + +One major challenge in performance of optimization of this sort of code is +choosing the ordering and behavior of various loop transformations that get +applied. There are non-local effects of every decision, and neither +representation helps solve this inherently hard problem. + +### Commonality: compactness of IR + +In the cases that are most relevant to us (hyper rectangular spaces) these forms +are directly equivalent: a traditional instruction with a limited domain (e.g. +the "reduced_domain_example" above) ends up having one level of ML 'if' inside +its loops. The simplified form pays for this by eliminating schedules and +domains from the IR. Both forms allow code duplication to reduce dynamic +branches in the IR: the traditional approach allows instruction splitting, the +simplified form supports instruction duplication. + +It is important to point out that the traditional form wins on compactness in +the extreme cases: e.g. the loop skewing case. These cases will be rare in +practice for our workloads, and are exactly the cases that downstream +transformations want to be explicit about what they are doing. + +### Simplicity of code generation + +A key final stage of an mlfunc is its conversion to a CFG function, which is +required as part of lowering to the target machine. The simplified form has a +clear advantage here: the IR has a direct correspondence to the structure of the +generated code. + +In contrast, the traditional form has significant complexity in the lowering +process to a CFG function, because the verbosity not imbued in the IR needs to +come out during code generation. Code generation from ISL shows that it is +possible to do this, but it is a non-trivial transformation. 
+ +### Ease of transformation + +An advantage for the traditional form is that it is easier to perform certain +transformations on it: skewing and tiling are just transformations on the +schedule of the instructions in question, it doesn't require changing the loop +structure. + +In practice, the simplified form requires moving the complexity of code +generation into the transformations themselves - this is sometimes trivial, +sometimes involved. The author believes that this should be possible by making +the code generation algorithms themselves be library functions that +transformations call into, instead of an opaque block that happens at the end of +the mlfunc processing. + +Also, the sorts of transformations performed today by XLA (including tiling, +padding, unrolling, and other rectangular transformations) should be easy enough +to implement on either representation. The only cases that are a challenge are +more advanced cases like skewing, e.g. for DMA data movement generation. + +### Ease of analysis: Cost models + +The simplified form is much easier for analyses and transformations to build +cost models for (e.g. answering the question of "how much code bloat will be +caused by unrolling a loop at this level?"), because it is easier to predict +what target code will be generated. With the traditional form, these analyses +will have to anticipate what polyhedral codegen will do to a set of instructions +under consideration: something that is non-trivial in the interesting cases in +question (see "Cost of code generation"). + +### Cost of code generation + +State of the art polyhedral code generation is +[expensive and complicated](https://lirias.kuleuven.be/bitstream/123456789/497238/1/toplas-astgen.pdf), +sometimes exponential time complexity. We expect that most machine learning +workloads will be hyper-rectangular, and thus it should be easy to specialize in +important cases. 
That said, the traditional polyhedral representation makes it +very easy to introduce complicated and expensive schedules, and provides no way +to understand and project a cost model for using them. All downstream clients of +the IR need to be prepared to handle the full generality of IR that may come to +them. + +The simplified form defines this away: the concepts in the IR remain simple, and +the code much more directly reflects the cost model for lowering to CFG +functions and machine code. This is expected to be very important in the late +stages of a code generator for an accelerator. + +### SSA in ML Functions + +We agree already that values defined in an mlfunc can include scalar values and +they are defined based on traditional dominance. In the simplified form, this is +very simple: arguments and induction variables defined in for-loops are live +inside their lexical body, and linear series of instructions have the same "top +down" dominance relation that a basic block does. + +In the traditional form though, this is not the case: it seems that a lot of +knowledge about how codegen will emit the code is necessary to determine if SSA +form is correct or not. For example, this is invalid code: + +``` + %tmp = call @S1(%X, %0, %1) + domain: (10 <= %i < %N), (0 <= %j < %N) + schedule: (i, j) + + call @S2(%tmp, %A, %0, %1) + domain: (0 <= %i < %N), (0 <= %j < %N) + schedule: (i, j) +``` + +Because `%tmp` isn't defined on some iterations of the %i loop. + +This matters because it makes the verifier more complicated, but more +significantly, it means that load promotion and other optimizations that will +produce SSA form will need to be aware of this and be able to model what codegen +does. + +An emergent property of this that we discussed recently is that PHI nodes in +mlfunc's (if we support them) will also have to have domains. 
+ +### Lack of redundancy in IR + +The traditional form has multiple encodings for the same sorts of behavior: you +end up having bits on `affine.for` loops to specify whether codegen should use +"atomic/separate" policies, unroll loops, etc. Instructions can be split or can +generate multiple copies of their instruction because of overlapping domains, +etc. + +This is a problem for analyses and cost models, because they each have to reason +about these additional forms in the IR. + +### Suitability to purpose: lowering to machine code + +One of the main drivers for this work is lowering to low-level accelerator code, +including two-dimensional vectorization, insertion of DMAs, and other +utilization of the matrix accelerator units. In the author's opinion, the extra +compactness of the traditional form is a negative for this purpose: reasoning +about the generated machine code will require understanding the mapping from +mlfunc to lowered code, which means that it must understand what code generation +will do. + +In the simplified form, the effect of "code generation" is always obvious from +the IR itself, which should make it easier to perform vectorization to target +instructions and other analyses we need to perform. + +## Third Alternative: two different levels of mlfunc + +One hybrid alternative is to support both the traditional and simplified forms +of mlfunc in our IR. + +The stages could look like this, for example: + +1. Early performance transformations could be done on the traditional form. +1. Partial code generation lowers to the simplified form +1. Target specific lowering phases for tiling, and vectorization and other 2D + transforms that don't benefit much from the traditional form could be run. +1. Final codegen to a cfg func can be done when all of the instructions are + replaced with ones valid on the target. + +While this is possible, it isn't clear what would justify the complexity of this +approach. 
Unless there is a super compelling reason for this, it would be nice +to not do this. **Update:** we discussed this as a design team and agreed that +this wouldn't be a good way to go. diff --git a/mlir/docs/TestingGuide.md b/mlir/docs/TestingGuide.md new file mode 100644 index 0000000000000000000000000000000000000000..723b78bf0f58236c22e7548b5b8de66e2c2dbb47 --- /dev/null +++ b/mlir/docs/TestingGuide.md @@ -0,0 +1,171 @@ +# Testing Guide + +Testing is an integral part of any software infrastructure. In general, all +commits to the MLIR repository should include an accompanying test of some form. +Commits that include no functional changes, such as API changes like symbol +renaming, should be tagged with NFC(no functional changes). This signals to the +reviewer why the change doesn't/shouldn't include a test. + +MLIR generally separates testing into two main categories, [Check](#check-tests) +tests and [Unit](#unit-tests) tests. + +## Check tests + +Check tests are tests that verify that some set of string tags appear in the +output of some program. These tests generally encompass anything related to the +state of the IR (and more); analysis, parsing, transformation, verification, +etc. They are written utilizing several different tools: + +### FileCheck tests + +[FileCheck](https://llvm.org/docs/CommandGuide/FileCheck.html) is a utility tool +that "reads two files (one from standard input, and one specified on the command +line) and uses one to verify the other." Essentially, one file contains a set of +tags that are expected to appear in the output file. MLIR utilizes FileCheck, in +combination with [lit](https://llvm.org/docs/CommandGuide/lit.html), to verify +different aspects of the IR - such as the output of a transformation pass. 
+ +An example FileCheck test is shown below: + +```mlir +// RUN: mlir-opt %s -cse | FileCheck %s + +// CHECK-LABEL: func @simple_constant +func @simple_constant() -> (i32, i32) { + // CHECK-NEXT: %[[RESULT:.*]] = constant 1 + // CHECK-NEXT: return %[[RESULT]], %[[RESULT]] + + %0 = constant 1 : i32 + %1 = constant 1 : i32 + return %0, %1 : i32, i32 +} +``` + +The above test performs a check that after running Common Sub-Expression +elimination, only one constant remains in the IR. + +#### FileCheck best practices + +FileCheck is an extremely useful utility, it allows for easily matching various +parts of the output. This ease of use means that it becomes easy to write +brittle tests that are essentially `diff` tests. FileCheck tests should be as +self-contained as possible and focus on testing the minimal set of +functionalities needed. Let's see an example: + +```mlir +// RUN: mlir-opt %s -cse | FileCheck %s + +// CHECK-LABEL: func @simple_constant() -> (i32, i32) +func @simple_constant() -> (i32, i32) { + // CHECK-NEXT: %result = constant 1 : i32 + // CHECK-NEXT: return %result, %result : i32, i32 + // CHECK-NEXT: } + + %0 = constant 1 : i32 + %1 = constant 1 : i32 + return %0, %1 : i32, i32 +} +``` + +The above example is another way to write the original example shown in the main +[FileCheck tests](#filecheck-tests) section. There are a few problems with this +test; below is a breakdown of the no-nos of this test to specifically highlight +best practices. + +* Tests should be self-contained. + +This means that tests should not test lines or sections outside of what is +intended. In the above example, we see lines such as `CHECK-NEXT: }`. This line +in particular is testing pieces of the Parser/Printer of FuncOp, which is +outside of the realm of concern for the CSE pass. This line should be removed. + +* Tests should be minimal, and only check what is absolutely necessary. 
+ +This means that anything in the output that is not core to the functionality +that you are testing should *not* be present in a CHECK line. This is a separate +bullet just to highlight the importance of it, especially when checking against +IR output. + +If we naively remove the unrelated `CHECK` lines in our source file, we may end +up with: + +```mlir +// CHECK-LABEL: func @simple_constant +func @simple_constant() -> (i32, i32) { + // CHECK-NEXT: %result = constant 1 : i32 + // CHECK-NEXT: return %result, %result : i32, i32 + + %0 = constant 1 : i32 + %1 = constant 1 : i32 + return %0, %1 : i32, i32 +} +``` + +It may seem like this is a minimal test case, but it still checks several +aspects of the output that are unrelated to the CSE transformation. Namely the +result types of the `constant` and `return` operations, as well the actual SSA +value names that are produced. FileCheck `CHECK` lines may contain +[regex statements](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-regex-matching-syntax) +as well as named +[string substitution blocks](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-string-substitution-blocks). +Utilizing the above, we end up with the example shown in the main +[FileCheck tests](#filecheck-tests) section. + +```mlir +// CHECK-LABEL: func @simple_constant +func @simple_constant() -> (i32, i32) { + /// Here we use a substitution variable as the output of the constant is + /// useful for the test, but we omit as much as possible of everything else. + // CHECK-NEXT: %[[RESULT:.*]] = constant 1 + // CHECK-NEXT: return %[[RESULT]], %[[RESULT]] + + %0 = constant 1 : i32 + %1 = constant 1 : i32 + return %0, %1 : i32, i32 +} +``` + +### Diagnostic verification tests + +MLIR provides rich source location tracking that can be used to emit errors, +warnings, etc. easily from anywhere throughout the codebase. 
Certain classes of
+tests are written to check that certain diagnostics are emitted for a given
+input program, such as an MLIR file. These tests are useful in that they allow
+checking specific invariants of the IR without transforming or changing
+anything. Some examples of tests in this category are: those that verify
+invariants of operations, or check the expected results of an analysis.
+Diagnostic verification tests are written utilizing the
+[source manager verifier handler](Diagnostics.md#sourcemgr-diagnostic-verifier-handler),
+accessible via the `verify-diagnostics` flag in mlir-opt.
+
+An example .mlir test running under `mlir-opt` is shown below:
+
+```mlir
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics
+
+// Expect an error on the same line.
+func @bad_branch() {
+  br ^missing // expected-error {{reference to an undefined block}}
+}
+
+// -----
+
+// Expect an error on an adjacent line.
+func @foo(%a : f32) {
+  // expected-error@+1 {{unknown comparison predicate "foo"}}
+  %result = cmpf "foo", %a, %a : f32
+  return
+}
+```
+
+## Unit tests
+
+Unit tests are written using
+[Google Test](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
+and are located in the unittests/ directory. Tests of this form *should* be
+limited to API tests that cannot be reasonably written as [Check](#check-tests)
+tests, e.g. those for data structures. It is important to keep in mind that the
+C++ APIs are not stable, and evolve over time. As such, directly testing the C++
+IR interfaces makes the tests more fragile as those C++ APIs evolve over time.
+This makes future API refactorings, which may happen frequently, much more
+cumbersome as the number of tests scales. 
diff --git a/mlir/docs/Traits.md b/mlir/docs/Traits.md new file mode 100644 index 0000000000000000000000000000000000000000..b233f9bef66dd7d4955f7f9f04f7ef7055f48785 --- /dev/null +++ b/mlir/docs/Traits.md @@ -0,0 +1,246 @@ +# Introduction to MLIR Operation Traits + +[TOC] + +MLIR allows for a truly open operation ecosystem, as any dialect may define +operations that suit a specific level of abstraction. `Traits` are a mechanism +in which to abstract implementation details and properties that are common +across many different operations. `Traits` may be used to specify special +properties and constraints of the operation, including whether the operation has +side effects or whether its output has the same type as the input. Some examples +of traits are `Commutative`, `SingleResult`, `Terminator`, etc. See the more +[comprehensive list](#traits) below for more examples of what is possible. + +## Defining a Trait + +Traits may be defined in C++ by inheriting from the +`OpTrait::TraitBase` class. This base class takes as +template parameters: + +* ConcreteType + - The concrete operation type that this trait was attached to. +* TraitType + - The type of the trait class that is being defined, for use with the + [`Curiously Recurring Template Pattern`](https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern). + +A derived trait class is expected to take a single template that corresponds to +the `ConcreteType`. An example trait definition is shown below: + +```c++ +template +class MyTrait : public OpTrait::TraitBase { +}; +``` + +Derived traits may also provide a `verifyTrait` hook, that is called when +verifying the concrete operation. The trait verifiers will currently always be +invoked before the main `Op::verify`. + +```c++ +template +class MyTrait : public OpTrait::TraitBase { +public: + /// Override the 'verifyTrait' hook to add additional verification on the + /// concrete operation. + static LogicalResult verifyTrait(Operation *op) { + // ... 
+ } +}; +``` + +Note: It is generally good practice to define the implementation of the +`verifyTrait` hook out-of-line as a free function when possible to avoid +instantiating the implementation for every concrete operation type. + +### Parametric Traits + +The above demonstrates the definition of a simple self-contained trait. It is +also often useful to provide some static parameters to the trait to control its +behavior. Given that the definition of the trait class is rigid, i.e. we must +have a single template argument for the concrete operation, the templates for +the parameters will need to be split out. An example is shown below: + +```c++ +template +class MyParametricTrait { +public: + template + class Impl : public OpTrait::TraitBase { + // Inside of 'Impl' we have full access to the template parameters + // specified above. + }; +}; +``` + +## Attaching a Trait + +Traits may be used when defining a derived operation type, by simply adding the +name of the trait class to the `Op` class after the concrete operation type: + +```c++ +/// Here we define 'MyOp' along with the 'MyTrait' and `MyParametric trait +/// classes we defined previously. +class MyOp : public Op::Impl> {}; +``` + +To use a trait in the [ODS](OpDefinitions.md) framework, we need to provide a +definition of the trait class. This can be done using the `NativeOpTrait` and +`ParamNativeOpTrait` classes. `ParamNativeOpTrait` provides a mechanism in which +to specify arguments to a parametric trait class with an internal `Impl`. + +```tablegen +// The argument is the c++ trait class name. +def MyTrait : NativeOpTrait<"MyTrait">; + +// The first argument is the parent c++ class name. The second argument is a +// string containing the parameter list. 
+class MyParametricTrait + : NativeOpTrait<"MyParametricTrait", !cast(!head(parameters))>; +``` + +These can then be used in the `traits` list of an op definition: + +```tablegen +def OpWithInferTypeInterfaceOp : Op<...[MyTrait, MyParametricTrait<10>]> { ... } +``` + +See the documentation on [operation definitions](OpDefinitions.md) for more +details. + +## Using a Trait + +Traits may be used to provide additional methods, static fields, or other +information directly on the concrete operation. `Traits` internally become +`Base` classes of the concrete operation, so all of these are directly +accessible. To expose this information opaquely to transformations and analyses, +[`interfaces`](Interfaces.md) may be used. + +To query if a specific operation contains a specific trait, the `hasTrait<>` +method may be used. This takes as a template parameter the trait class, which is +the same as the one passed when attaching the trait to an operation. + +```c++ +Operation *op = ..; +if (op->hasTrait() || op->hasTrait::Impl>()) + ...; +``` + +## Trait List + +MLIR provides a suite of traits that provide various functionalities that are +common across many different operations. Below is a list of some key traits that +may be used directly by any dialect. The format of the header for each trait +section goes as follows: + +* `Header` + - (`C++ class` -- `ODS class`(if applicable)) + +### Broadcastable + +* `OpTrait::BroadcastableTwoOperandsOneResult` -- `Broadcastable` + +This trait provides the API for operations that are known to have +[broadcast-compatible](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +operand and result types. Specifically, starting from the most varying +dimension, each dimension pair of the two operands' types should either be the +same or one of them is one. Also, the result type should have the corresponding +dimension equal to the larger one, if known. Shapes are checked partially if +ranks or dimensions are not known. 
For example, an op with `tensor` and
+`tensor<2xf32>` as operand types and `tensor<3x2xf32>` as the result type is
+broadcast-compatible.
+
+This trait assumes the op has two operands and one result, and it asserts if the
+pre-condition is not satisfied.
+
+### Commutative
+
+* `OpTrait::IsCommutative` -- `Commutative`
+
+This trait adds the property that the operation is commutative, i.e. `X op Y ==
+Y op X`
+
+### Function-Like
+
+* `OpTrait::FunctionLike`
+
+This trait provides APIs for operations that behave like functions. In
+particular:
+
+- Ops must be symbols, i.e. also have the `Symbol` trait;
+- Ops have a single region with multiple blocks that corresponds to the body
+  of the function;
+- the absence of a region corresponds to an external function;
+- arguments of the first block of the region are treated as function
+  arguments;
+- they can have argument and result attributes that are stored in dictionary
+  attributes on the operation itself.
+
+This trait does *NOT* provide type support for the functions, meaning that
+concrete Ops must handle the type of the declared or defined function.
+`getTypeAttrName()` is a convenience function that returns the name of the
+attribute that can be used to store the function type, but the trait makes no
+assumption based on it.
+
+### HasParent
+
+* `OpTrait::HasParent` -- `HasParent`
+
+This trait provides APIs and verifiers for operations that can only be nested
+within regions that are attached to operations of `ParentOpType`.
+
+### IsolatedFromAbove
+
+* `OpTrait::IsIsolatedFromAbove` -- `IsolatedFromAbove`
+
+This trait signals that the regions of an operation are known to be isolated
+from above. This trait asserts that the regions of an operation will not
+capture, or reference, SSA values defined above the region scope. 
This means +that the following is invalid if `foo.region_op` is defined as +`IsolatedFromAbove`: + +```mlir +%result = constant 10 : i32 +foo.region_op { + foo.yield %result : i32 +} +``` + +This trait is an important structural property of the IR, and enables operations +to have [passes](WritingAPass.md) scheduled under them. + +### NoSideEffect + +* `OpTrait::HasNoSideEffect` -- `NoSideEffect` + +This trait signifies that the operation is pure and has no visible side effects. + +### Single Block with Implicit Terminator + +* `OpTrait::SingleBlockImplicitTerminator` : + `SingleBlockImplicitTerminator` + +This trait provides APIs and verifiers for operations with regions that have a +single block that must terminate with `TerminatorOpType`. + +### Symbol + +* `OpTrait::Symbol` -- `Symbol` + +This trait is used for operations that define a `Symbol`. + +TODO(riverriddle) Link to the proper document detailing the design of symbols. + +### SymbolTable + +* `OpTrait::SymbolTable` -- `SymbolTable` + +This trait is used for operations that define a `SymbolTable`. + +TODO(riverriddle) Link to the proper document detailing the design of symbols. + +### Terminator + +* `OpTrait::IsTerminator` -- `Terminator` + +This trait provides verification and functionality for operations that are known +to be [terminators](LangRef.md#terminator-operations). diff --git a/mlir/docs/Tutorials/Toy/Ch-1.md b/mlir/docs/Tutorials/Toy/Ch-1.md new file mode 100644 index 0000000000000000000000000000000000000000..cb7f97cb3f69e003c714b06546a1aa2f6073406a --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-1.md @@ -0,0 +1,169 @@ +# Chapter 1: Toy Tutorial Introduction + +[TOC] + +This tutorial runs through the implementation of a basic toy language on top of +MLIR. 
The goal of this tutorial is to introduce the concepts of MLIR; in
+particular, how [dialects](../../LangRef.md#dialects) can help easily support
+language specific constructs and transformations while still offering an easy
+path to lower to LLVM or other codegen infrastructure. This tutorial is based on
+the model of the
+[LLVM Kaleidoscope Tutorial](https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/index.html).
+
+This tutorial assumes you have cloned and built MLIR; if you have not yet done
+so, see
+[Getting started with MLIR](https://github.com/tensorflow/mlir#getting-started-with-mlir).
+
+## The Chapters
+
+This tutorial is divided into the following chapters:
+
+- [Chapter #1](Ch-1.md): Introduction to the Toy language and the definition
+  of its AST.
+- [Chapter #2](Ch-2.md): Traversing the AST to emit a dialect in MLIR,
+  introducing base MLIR concepts. Here we show how to start attaching
+  semantics to our custom operations in MLIR.
+- [Chapter #3](Ch-3.md): High-level language-specific optimization using
+  pattern rewriting system.
+- [Chapter #4](Ch-4.md): Writing generic dialect-independent transformations
+  with Interfaces. Here we will show how to plug dialect specific information
+  into generic transformations like shape inference and inlining.
+- [Chapter #5](Ch-5.md): Partially lowering to lower-level dialects. We'll
+  convert some of our high-level language specific semantics towards a generic
+  affine oriented dialect for optimization.
+- [Chapter #6](Ch-6.md): Lowering to LLVM and code generation. Here we'll
+  target LLVM IR for code generation, and detail more of the lowering
+  framework.
+- [Chapter #7](Ch-7.md): Extending Toy: Adding support for a composite type.
+  We'll demonstrate how to add a custom type to MLIR, and how it fits in the
+  existing pipeline.
+
+## The Language
+
+This tutorial will be illustrated with a toy language that we’ll call “Toy”
+(naming is hard...). 
Toy is a tensor-based language that allows you to define +functions, perform some math computation, and print results. + +Given that we want to keep things simple, the codegen will be limited to tensors +of rank <= 2, and the only datatype in Toy is a 64-bit floating point type (aka +‘double’ in C parlance). As such, all values are implicitly double precision, +`Values` are immutable (i.e. every operation returns a newly allocated value), +and deallocation is automatically managed. But enough with the long description; +nothing is better than walking through an example to get a better understanding: + +```Toy {.toy} +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. + # The shape is inferred from the supplied literal. + var a = [[1, 2, 3], [4, 5, 6]]; + + # b is identical to a, the literal tensor is implicitly reshaped: defining new + # variables is the way to reshape tensors (element count must match). + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + + # transpose() and print() are the only builtin, the following will transpose + # a and b and perform an element-wise multiplication before printing the result. + print(transpose(a) * transpose(b)); +} +``` + +Type checking is statically performed through type inference; the language only +requires type declarations to specify tensor shapes when needed. Functions are +generic: their parameters are unranked (in other words, we know these are +tensors, but we don't know their dimensions). They are specialized for every +newly discovered signature at call sites. Let's revisit the previous example by +adding a user-defined function: + +```Toy {.toy} +# User defined generic function that operates on unknown shaped arguments. +def multiply_transpose(a, b) { + return transpose(a) * transpose(b); +} + +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. 
+ var a = [[1, 2, 3], [4, 5, 6]]; + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + + # This call will specialize `multiply_transpose` with <2, 3> for both + # arguments and deduce a return type of <3, 2> in initialization of `c`. + var c = multiply_transpose(a, b); + + # A second call to `multiply_transpose` with <2, 3> for both arguments will + # reuse the previously specialized and inferred version and return <3, 2>. + var d = multiply_transpose(b, a); + + # A new call with <3, 2> (instead of <2, 3>) for both dimensions will + # trigger another specialization of `multiply_transpose`. + var e = multiply_transpose(c, d); + + # Finally, calling into `multiply_transpose` with incompatible shape will + # trigger a shape inference error. + var f = multiply_transpose(transpose(a), c); +} +``` + +## The AST + +The AST from the above code is fairly straightforward; here is a dump of it: + +``` +Module: + Function + Proto 'multiply_transpose' @test/ast.toy:5:1' + Args: [a, b] + Block { + Return + BinOp: * @test/ast.toy:6:25 + Call 'transpose' [ @test/ast.toy:6:10 + var: a @test/ast.toy:6:20 + ] + Call 'transpose' [ @test/ast.toy:6:25 + var: b @test/ast.toy:6:35 + ] + } // Block + Function + Proto 'main' @test/ast.toy:9:1' + Args: [] + Block { + VarDecl a<> @test/ast.toy:11:3 + Literal: <2, 3>[<3>[1.000000e+00, 2.000000e+00, 3.000000e+00], <3>[4.000000e+00, 5.000000e+00, 6.000000e+00]] @test/ast.toy:11:17 + VarDecl b<2, 3> @test/ast.toy:12:3 + Literal: <6>[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00] @test/ast.toy:12:17 + VarDecl c<> @test/ast.toy:15:3 + Call 'multiply_transpose' [ @test/ast.toy:15:11 + var: a @test/ast.toy:15:30 + var: b @test/ast.toy:15:33 + ] + VarDecl d<> @test/ast.toy:18:3 + Call 'multiply_transpose' [ @test/ast.toy:18:11 + var: b @test/ast.toy:18:30 + var: a @test/ast.toy:18:33 + ] + VarDecl e<> @test/ast.toy:21:3 + Call 'multiply_transpose' [ @test/ast.toy:21:11 + var: b @test/ast.toy:21:30 + var: c @test/ast.toy:21:33 + ] 
+ VarDecl f<> @test/ast.toy:24:3 + Call 'multiply_transpose' [ @test/ast.toy:24:11 + Call 'transpose' [ @test/ast.toy:24:30 + var: a @test/ast.toy:24:40 + ] + var: c @test/ast.toy:24:44 + ] + } // Block +``` + +You can reproduce this result and play with the example in the +`examples/toy/Ch1/` directory; try running `path/to/BUILD/bin/toyc-ch1 +test/Examples/Toy/Ch1/ast.toy -emit=ast`. + +The code for the lexer is fairly straightforward; it is all in a single header: +`examples/toy/Ch1/include/toy/Lexer.h`. The parser can be found in +`examples/toy/Ch1/include/toy/Parser.h`; it is a recursive descent parser. If +you are not familiar with such a Lexer/Parser, these are very similar to the +LLVM Kaleidoscope equivalent that are detailed in the first two chapters of the +[Kaleidoscope Tutorial](https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html). + +The [next chapter](Ch-2.md) will demonstrate how to convert this AST into MLIR. diff --git a/mlir/docs/Tutorials/Toy/Ch-2.md b/mlir/docs/Tutorials/Toy/Ch-2.md new file mode 100755 index 0000000000000000000000000000000000000000..ce46788f4aefb5af03a412d1b1b7b19063453c46 --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-2.md @@ -0,0 +1,577 @@ +# Chapter 2: Emitting Basic MLIR + +[TOC] + +Now that we're familiar with our language and the AST, let's see how MLIR can +help to compile Toy. + +## Introduction: Multi-Level Intermediate Representation + +Other compilers, like LLVM (see the +[Kaleidoscope tutorial](https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/index.html)), +offer a fixed set of predefined types and (usually *low-level* / RISC-like) +instructions. It is up to the frontend for a given language to perform any +language-specific type-checking, analysis, or transformation before emitting +LLVM IR. For example, Clang will use its AST to perform not only static analysis +but also transformations, such as C++ template instantiation through AST cloning +and rewrite. 
Finally, languages with constructs at a higher level than C/C++
+may require non-trivial lowering from their AST to generate LLVM IR.
+
+As a consequence, multiple frontends end up reimplementing significant pieces of
+infrastructure to support the need for these analyses and transformations. MLIR
+addresses this issue by being designed for extensibility. As such, there are few
+pre-defined instructions (*operations* in MLIR terminology) or types.
+
+## Interfacing with MLIR
+
+[Language reference](../../LangRef.md)
+
+MLIR is designed to be a completely extensible infrastructure; there is no
+closed set of attributes (think: constant metadata), operations, or types. MLIR
+supports this extensibility with the concept of
+[Dialects](../../LangRef.md#dialects). Dialects provide a grouping mechanism for
+abstraction under a unique `namespace`.
+
+In MLIR, [`Operations`](../../LangRef.md#operations) are the core unit of
+abstraction and computation, similar in many ways to LLVM instructions.
+Operations can have application-specific semantics and can be used to represent
+all of the core IR structures in LLVM: instructions, globals (like functions),
+modules, etc.
+
+Here is the MLIR assembly for the Toy `transpose` operation:
+
+```mlir
+%t_tensor = "toy.transpose"(%tensor) {inplace = true} : (tensor<2x3xf64>) -> tensor<3x2xf64> loc("example/file/path":12:1)
+```
+
+Let's break down the anatomy of this MLIR operation:
+
+- `%t_tensor`
+
+    * The name given to the result defined by this operation (which includes
+      [a prefixed sigil to avoid collisions](../../LangRef.md#identifiers-and-keywords)).
+      An operation may define zero or more results (in the context of Toy, we
+      will limit ourselves to single-result operations), which are SSA values.
+      The name is used during parsing but is not persistent (e.g., it is not
+      tracked in the in-memory representation of the SSA value).
+
+- `"toy.transpose"`
+
+    * The name of the operation. 
It is expected to be a unique string, with + the namespace of the dialect prefixed before the "`.`". This can be read + as the `transpose` operation in the `toy` dialect. + +- `(%tensor)` + + * A list of zero or more input operands (or arguments), which are SSA + values defined by other operations or referring to block arguments. + +- `{ inplace = true }` + + * A dictionary of zero or more attributes, which are special operands that + are always constant. Here we define a boolean attribute named 'inplace' + that has a constant value of true. + +- `(tensor<2x3xf64>) -> tensor<3x2xf64>` + + * This refers to the type of the operation in a functional form, spelling + the types of the arguments in parentheses and the type of the return + values afterward. + +- `loc("example/file/path":12:1)` + + * This is the location in the source code from which this operation + originated. + +Shown here is the general form of an operation. As described above, the set of +operations in MLIR is extensible. This means that the infrastructure must be +able to opaquely reason about the structure of an operation. This is done by +boiling down the composition of an operation into discrete pieces: + +- A name for the operation. +- A list of SSA operand values. +- A list of [attributes](../../LangRef.md#attributes). +- A list of [types](../../LangRef.md#type-system) for result values. +- A [source location](../../Diagnostics.md#source-locations) for debugging + purposes. +- A list of successors [blocks](../../LangRef.md#blocks) (for branches, + mostly). +- A list of [regions](../../LangRef.md#regions) (for structural operations + like functions). + +In MLIR, every operation has a mandatory source location associated with it. +Contrary to LLVM, where debug info locations are metadata and can be dropped, in +MLIR, the location is a core requirement, and APIs depend on and manipulate it. +Dropping a location is thus an explicit choice which cannot happen by mistake. 
+ +To provide an illustration: If a transformation replaces an operation by +another, that new operation must still have a location attached. This makes it +possible to track where that operation came from. + +It's worth noting that the mlir-opt tool - a tool for testing +compiler passes - does not include locations in the output by default. The +`-mlir-print-debuginfo` flag specifies to include locations. (Run `mlir-opt +--help` for more options.) + +### Opaque API + +MLIR is designed to be a completely extensible system, and as such, the +infrastructure has the capability to opaquely represent all of its core +components: attributes, operations, types, etc. This allows MLIR to parse, +represent, and [round-trip](../../Glossary.md#round-trip) any valid IR. For +example, we could place our Toy operation from above into an `.mlir` file and +round-trip through *mlir-opt* without registering anything: + +```mlir +func @toy_func(%tensor: tensor<2x3xf64>) -> tensor<3x2xf64> { + %t_tensor = "toy.transpose"(%tensor) { inplace = true } : (tensor<2x3xf64>) -> tensor<3x2xf64> + return %t_tensor : tensor<3x2xf64> +} +``` + +In the cases of unregistered attributes, operations, and types, MLIR will +enforce some structural constraints (SSA, block termination, etc.), but +otherwise they are completely opaque. This can be useful for bootstrapping +purposes, but it is generally advised against. Opaque operations must be treated +conservatively by transformations and analyses, and they are much harder to +construct and manipulate. + +This handling can be observed by crafting what should be an invalid IR for Toy +and seeing it round-trip without tripping the verifier: + +```mlir +// RUN: toyc %s -emit=mlir + +func @main() { + %0 = "toy.print"() : () -> tensor<2x3xf64> +} +``` + +There are multiple problems here: the `toy.print` operation is not a terminator; +it should take an operand; and it shouldn't return any values. 
In the next +section, we will register our dialect and operations with MLIR, plug into the +verifier, and add nicer APIs to manipulate our operations. + +## Defining a Toy Dialect + +To effectively interface with MLIR, we will define a new Toy dialect. This +dialect will properly model the semantics of the Toy language, as well as +provide an easy avenue for high-level analysis and transformation. + +```c++ +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods, which will be demonstrated in later chapters of the tutorial. +class ToyDialect : public mlir::Dialect { + public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; +``` + +The dialect can now be registered in the global registry: + +```c++ + mlir::registerDialect(); +``` + +Any new `MLIRContext` created from now on will contain an instance of the Toy +dialect and invoke specific hooks for things like parsing attributes and types. + +## Defining Toy Operations + +Now that we have a `Toy` dialect, we can start registering operations. This will +allow for providing semantic information that the rest of the system can hook +into. Let's walk through the creation of the `toy.constant` operation: + +```mlir + %4 = "toy.constant"() {value = dense<1.0> : tensor<2x3xf64>} : () -> tensor<2x3xf64> +``` + +This operation takes zero operands, a +[dense elements](../../LangRef.md#dense-elements-attribute) attribute named +`value`, and returns a single result of +[TensorType](../../LangRef.md#tensor-type). 
An operation inherits from the
[CRTP](https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern)
`mlir::Op` class which also takes some optional [*traits*](../../Traits.md) to
customize its behavior. These traits may provide additional accessors,
verification, etc.

```c++
class ConstantOp : public mlir::Op<ConstantOp, mlir::OpTrait::ZeroOperands,
                                   mlir::OpTrait::OneResult,
                                   mlir::OpTrait::HasNoSideEffect> {
public:
  /// Inherit the constructors from the base Op class.
  using Op::Op;

  /// Provide the unique name for this operation. MLIR will use this to register
  /// the operation and uniquely identify it throughout the system.
  static llvm::StringRef getOperationName() { return "toy.constant"; }

  /// Return the value of the constant by fetching it from the attribute.
  mlir::DenseElementsAttr getValue();

  /// Operations can provide additional verification beyond the traits they
  /// define. Here we will ensure that the specific invariants of the constant
  /// operation are upheld, for example the result type must be of TensorType.
  LogicalResult verify();

  /// Provide an interface to build this operation from a set of input values.
  /// This interface is used by the builder to allow for easily generating
  /// instances of this operation:
  ///   mlir::OpBuilder::create<ConstantOp>(...)
  /// This method populates the given `state` that MLIR uses to create
  /// operations. This state is a collection of all of the discrete elements
  /// that an operation may contain.
  /// Build a constant with the given return type and `value` attribute.
  static void build(mlir::Builder *builder, mlir::OperationState &state,
                    mlir::Type result, mlir::DenseElementsAttr value);
  /// Build a constant and reuse the type from the given 'value'.
  static void build(mlir::Builder *builder, mlir::OperationState &state,
                    mlir::DenseElementsAttr value);
  /// Build a constant by broadcasting the given 'value'.
  static void build(mlir::Builder *builder, mlir::OperationState &state,
                    double value);
};
```

and we register this operation in the `ToyDialect` constructor:

```c++
ToyDialect::ToyDialect(mlir::MLIRContext *ctx)
    : mlir::Dialect(getDialectNamespace(), ctx) {
  addOperations<ConstantOp>();
}
```

### Op vs Operation: Using MLIR Operations

Now that we have defined an operation, we will want to access and transform it.
In MLIR, there are two main classes related to operations: `Operation` and `Op`.
Operation is the actual opaque instance of the operation, and represents the
general API into an operation instance. An `Op` is the base class of a derived
operation, like `ConstantOp`, and acts as a smart pointer wrapper around an
`Operation*`. This means that when we define our Toy operations, we are actually
providing a clean interface for building and interfacing with the `Operation`
class; this is why our `ConstantOp` defines no class fields. Therefore, we
always pass these classes around by value, instead of by reference or pointer
(*passing by value* is a common idiom and applies similarly to attributes,
types, etc). We can always get an instance of our toy operation by using LLVM's
casting infrastructure:

```c++
void processConstantOp(mlir::Operation *operation) {
  ConstantOp op = llvm::dyn_cast<ConstantOp>(operation);

  // This operation is not an instance of `ConstantOp`.
  if (!op)
    return;

  // Get the internal operation instance back.
  mlir::Operation *internalOperation = op.getOperation();
  assert(internalOperation == operation &&
         "these operation instances are the same");
}
```

### Using the Operation Definition Specification (ODS) Framework

In addition to specializing the `mlir::Op` C++ template, MLIR also supports
defining operations in a declarative manner. This is achieved via the
[Operation Definition Specification](../../OpDefinitions.md) framework.
Facts
regarding an operation are specified concisely into a TableGen record, which
will be expanded into an equivalent `mlir::Op` C++ template specialization at
compile time. Using the ODS framework is the desired way for defining operations
in MLIR given the simplicity, conciseness, and general stability in the face of
C++ API changes.

Let's see how to define the ODS equivalent of our ConstantOp:

The first thing to do is to define a link to the Toy dialect that we defined in
C++. This is used to link all of the operations that we will define to our
dialect:

```tablegen
// Provide a definition of the 'toy' dialect in the ODS framework so that we
// can define our operations.
def Toy_Dialect : Dialect {
  // The namespace of our dialect, this corresponds 1-1 with the string we
  // provided in `ToyDialect::getDialectNamespace`.
  let name = "toy";

  // The C++ namespace that the dialect class definition resides in.
  let cppNamespace = "toy";
}
```

Now that we have defined a link to the Toy dialect, we can start defining
operations. Operations in ODS are defined by inheriting from the `Op` class. To
simplify our operation definitions, we will define a base class for operations
in the Toy dialect.

```tablegen
// Base class for toy dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
//   * The parent dialect of the operation.
//   * The mnemonic for the operation, or the name without the dialect prefix.
//   * A list of traits for the operation.
class Toy_Op<string mnemonic, list<OpTrait> traits = []> :
    Op<Toy_Dialect, mnemonic, traits>;
```

With all of the preliminary pieces defined, we can begin to define the constant
operation.

We define a toy operation by inheriting from our base 'Toy_Op' class above. Here
we provide the mnemonic and a list of traits for the operation. The
[mnemonic](../../OpDefinitions.md#operation-name) here matches the one given in
`ConstantOp::getOperationName` without the dialect prefix: `toy.`.
The constant +operation here is also marked as 'NoSideEffect'. This is an ODS trait, and +matches one-to-one with the trait we providing when defining `ConstantOp`: +`mlir::OpTrait::HasNoSideEffect`. Missing here from our C++ definition are the +`ZeroOperands` and `OneResult` traits; these will be automatically inferred +based upon the `arguments` and `results` fields we define later. + +```tablegen +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { +} +``` + +At this point you probably might want to know what the C++ code generated by +TableGen looks like. Simply run the `mlir-tblgen` command with the +`gen-op-decls` or the `gen-op-defs` action like so: + +``` +${build_root}/bin/mlir-tblgen -gen-op-defs ${mlir_src_root}/examples/toy/Ch2/include/toy/Ops.td -I ${mlir_src_root}/include/ +``` + +Depending on the selected action, this will print either the `ConstantOp` class +declaration or its implementation. Comparing this output to the hand-crafted +implementation is incredibly useful when getting started with TableGen. + +#### Defining Arguments and Results + +With the shell of the operation defined, we can now provide the +[inputs](../../OpDefinitions.md#operation-arguments) and +[outputs](../../OpDefinitions.md#operation-results) to our operation. The +inputs, or arguments, to an operation may be attributes or types for SSA operand +values. The results correspond to a set of types for the values produced by the +operation: + +```tablegen +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // The constant operation takes an attribute as the only input. + // `F64ElementsAttr` corresponds to a 64-bit floating-point ElementsAttr. + let arguments = (ins F64ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + // F64Tensor corresponds to a 64-bit floating-point TensorType. + let results = (outs F64Tensor); +} +``` + +By providing a name to the arguments or results, e.g. 
`$value`, ODS will +automatically generate a matching accessor: `DenseElementsAttr +ConstantOp::value()`. + +#### Adding Documentation + +The next step after defining the operation is to document it. Operations may +provide +[`summary` and `description`](../../OpDefinitions.md#operation-documentation) +fields to describe the semantics of the operation. This information is useful +for users of the dialect and can even be used to auto-generate Markdown +documents. + +```tablegen +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant operation"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + }]; + + // The constant operation takes an attribute as the only input. + // `F64ElementsAttr` corresponds to a 64-bit floating-point ElementsAttr. + let arguments = (ins F64ElementsAttr:$value); + + // The generic call operation returns a single value of TensorType. + // F64Tensor corresponds to a 64-bit floating-point TensorType. + let results = (outs F64Tensor); +} +``` + +#### Verifying Operation Semantics + +At this point we've already covered a majority of the original C++ operation +definition. The next piece to define is the verifier. Luckily, much like the +named accessor, the ODS framework will automatically generate a lot of the +necessary verification logic based upon the constraints we have given. This +means that we don't need to verify the structure of the return type, or even the +input attribute `value`. In many cases, additional verification is not even +necessary for ODS operations. 
To add additional verification logic, an operation +can override the [`verifier`](../../OpDefinitions.md#custom-verifier-code) +field. The `verifier` field allows for defining a C++ code blob that will be run +as part of `ConstantOp::verify`. This blob can assume that all of the other +invariants of the operation have already been verified: + +```tablegen +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant operation"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + }]; + + // The constant operation takes an attribute as the only input. + // `F64ElementsAttr` corresponds to a 64-bit floating-point ElementsAttr. + let arguments = (ins F64ElementsAttr:$value); + + // The generic call operation returns a single value of TensorType. + // F64Tensor corresponds to a 64-bit floating-point TensorType. + let results = (outs F64Tensor); + + // Add additional verification logic to the constant operation. Here we invoke + // a static `verify` method in a C++ source file. This codeblock is executed + // inside of ConstantOp::verify, so we can use `this` to refer to the current + // operation instance. + let verifier = [{ return ::verify(*this); }]; +} +``` + +#### Attaching `build` Methods + +The final missing component here from our original C++ example are the `build` +methods. ODS can generate some simple build methods automatically, and in this +case it will generate our first build method for us. For the rest, we define the +[`builders`](../../OpDefinitions.md#custom-builder-methods) field. 
This field +takes a list of `OpBuilder` objects that take a string corresponding to a list +of C++ parameters, as well as an optional code block that can be used to specify +the implementation inline. + +```tablegen +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant operation"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + }]; + + // The constant operation takes an attribute as the only input. + // `F64ElementsAttr` corresponds to a 64-bit floating-point ElementsAttr. + let arguments = (ins F64ElementsAttr:$value); + + // The generic call operation returns a single value of TensorType. + // F64Tensor corresponds to a 64-bit floating-point TensorType. + let results = (outs F64Tensor); + + // Add additional verification logic to the constant operation. Here we invoke + // a static `verify` method in a c++ source file. This codeblock is executed + // inside of ConstantOp::verify, so we can use `this` to refer to the current + // operation instance. + let verifier = [{ return ::verify(*this); }]; + + // Add custom build methods for the constant operation. These methods populate + // the `state` that MLIR uses to create operations, i.e. these are used when + // using `builder.create(...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<"Builder *builder, OperationState &result, " + "DenseElementsAttr value", [{ + // Call into an autogenerated `build` method. + build(builder, result, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. 
This builder + // creates a declaration for `ConstantOp::build` with the given parameters. + OpBuilder<"Builder *builder, OperationState &result, double value"> + ]; +} +``` + +Above we introduce several of the concepts for defining operations in the ODS +framework, but there are many more that we haven't had a chance to: regions, +variadic operands, etc. Check out the +[full specification](../../OpDefinitions.md) for more details. + +## Complete Toy Example + +At this point we can generate our "Toy IR". A simplified version of the previous +example: + +```.toy +# User defined generic function that operates on unknown shaped arguments. +def multiply_transpose(a, b) { + return transpose(a) * transpose(b); +} + +def main() { + var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + var c = multiply_transpose(a, b); + var d = multiply_transpose(b, a); + print(d); +} +``` + +Results in the following IR: + +```mlir +module { + func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> loc("test/codegen.toy":5:10) + %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) -> tensor<*xf64> loc("test/codegen.toy":5:25) + %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> loc("test/codegen.toy":5:25) + "toy.return"(%2) : (tensor<*xf64>) -> () loc("test/codegen.toy":5:3) + } loc("test/codegen.toy":4:1) + func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> loc("test/codegen.toy":9:17) + %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> loc("test/codegen.toy":9:3) + %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> loc("test/codegen.toy":10:17) + %3 = "toy.reshape"(%2) : (tensor<6xf64>) -> 
tensor<2x3xf64> loc("test/codegen.toy":10:3) + %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":11:11) + %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("test/codegen.toy":12:11) + "toy.print"(%5) : (tensor<*xf64>) -> () loc("test/codegen.toy":13:3) + "toy.return"() : () -> () loc("test/codegen.toy":8:1) + } loc("test/codegen.toy":8:1) +} loc("test/codegen.toy":0:0) +``` + +You can build `toyc-ch2` and try yourself: `toyc-ch2 +test/Examples/Toy/Ch2/codegen.toy -emit=mlir -mlir-print-debuginfo`. We can also +check our RoundTrip: `toyc-ch2 test/Examples/Toy/Ch2/codegen.toy -emit=mlir +-mlir-print-debuginfo 2> codegen.mlir` followed by `toyc-ch2 codegen.mlir +-emit=mlir`. You should also use `mlir-tblgen` on the final definition file and +study the generated C++ code. + +At this point, MLIR knows about our Toy dialect and operations. In the +[next chapter](Ch-3.md), we will leverage our new dialect to implement some +high-level language-specific analyses and transformations for the Toy language. diff --git a/mlir/docs/Tutorials/Toy/Ch-3.md b/mlir/docs/Tutorials/Toy/Ch-3.md new file mode 100644 index 0000000000000000000000000000000000000000..615c2c1bbecc3f67a50734614da906d6af6582e0 --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-3.md @@ -0,0 +1,264 @@ +# Chapter 3: High-level Language-Specific Analysis and Transformation + +[TOC] + +Creating a dialect that closely represents the semantics of an input language +enables analyses, transformations and optimizations in MLIR that require +high-level language information and are generally performed on the language AST. +For example, `clang` has a fairly +[heavy mechanism](https://clang.llvm.org/doxygen/classclang_1_1TreeTransform.html) +for performing template instantiation in C++. + +We divide compiler transformations into two categories: local and global. 
In +this chapter, we focus on how to leverage the Toy Dialect and its high-level +semantics to perform local pattern-match transformations that would be difficult +in LLVM. For this, we use MLIR's +[Generic DAG Rewriter](../../GenericDAGRewriter.md). + +There are two methods that can be used to implement pattern-match +transformations: 1. Imperative, C++ pattern-match and rewrite 2. Declarative, +rule-based pattern-match and rewrite using table-driven +[Declarative Rewrite Rules](../../DeclarativeRewrites.md) (DRR). Note that the +use of DRR requires that the operations be defined using ODS, as described in +[Chapter 2](Ch-2.md). + +# Optimize Transpose using C++ style pattern-match and rewrite + +Let's start with a simple pattern and try to eliminate a sequence of two +transpose that cancel out: `transpose(transpose(X)) -> X`. Here is the +corresponding Toy example: + +```Toy(.toy) +def transpose_transpose(x) { + return transpose(transpose(x)); +} +``` + +Which corresponds to the following IR: + +```mlir +func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> { + %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> + %1 = "toy.transpose"(%0) : (tensor<*xf64>) -> tensor<*xf64> + "toy.return"(%1) : (tensor<*xf64>) -> () +} +``` + +This is a good example of a transformation that is trivial to match on the Toy +IR but that would be quite hard for LLVM to figure. 
For example, today Clang
can't optimize away the temporary array, and the computation with the naive
transpose is expressed with these loops:

```c++
#define N 100
#define M 100

void sink(void *);
void double_transpose(int A[N][M]) {
  int B[M][N];
  for(int i = 0; i < N; ++i) {
    for(int j = 0; j < M; ++j) {
      B[j][i] = A[i][j];
    }
  }
  for(int i = 0; i < N; ++i) {
    for(int j = 0; j < M; ++j) {
      A[i][j] = B[j][i];
    }
  }
  sink(A);
}
```

For a simple C++ approach to rewrite involving matching a tree-like pattern in
the IR and replacing it with a different set of operations, we can plug into the
MLIR `Canonicalizer` pass by implementing a `RewritePattern`:

```c++
/// Fold transpose(transpose(x)) -> x
struct SimplifyRedundantTranspose : public mlir::OpRewritePattern<TransposeOp> {
  /// We register this pattern to match every toy.transpose in the IR.
  /// The "benefit" is used by the framework to order the patterns and process
  /// them in order of profitability.
  SimplifyRedundantTranspose(mlir::MLIRContext *context)
      : OpRewritePattern<TransposeOp>(context, /*benefit=*/1) {}

  /// This method is attempting to match a pattern and rewrite it. The rewriter
  /// argument is the orchestrator of the sequence of rewrites. It is expected
  /// to interact with it to perform any changes to the IR from here.
  mlir::PatternMatchResult
  matchAndRewrite(TransposeOp op,
                  mlir::PatternRewriter &rewriter) const override {
    // Look through the input of the current transpose.
    mlir::Value transposeInput = op.getOperand();
    TransposeOp transposeInputOp =
        llvm::dyn_cast_or_null<TransposeOp>(transposeInput->getDefiningOp());
    // If the input is defined by another Transpose, bingo!
    if (!transposeInputOp)
      return matchFailure();

    // Use the rewriter to perform the replacement
    rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp});
    return matchSuccess();
  }
};
```

The implementation of this rewriter is in `ToyCombine.cpp`.
The
[canonicalization pass](../../Canonicalization.md) applies transformations
defined by operations in a greedy, iterative manner. To ensure that the
canonicalization pass applies our new transform, we set
[hasCanonicalizer = 1](../../OpDefinitions.md#hascanonicalizer) and register the
pattern with the canonicalization framework.

```c++
// Register our patterns for rewrite by the Canonicalization framework.
void TransposeOp::getCanonicalizationPatterns(
    OwningRewritePatternList &results, MLIRContext *context) {
  results.insert<SimplifyRedundantTranspose>(context);
}
```

We also need to update our main file, `toyc.cpp`, to add an optimization
pipeline. In MLIR, the optimizations are run through a `PassManager` in a
similar way to LLVM:

```c++
  mlir::PassManager pm(module.getContext());
  pm.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
```

Finally, we can run `toyc-ch3 test/transpose_transpose.toy -emit=mlir -opt` and
observe our pattern in action:

```mlir
func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> {
  %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64>
  "toy.return"(%arg0) : (tensor<*xf64>) -> ()
}
```

As expected, we now directly return the function argument, bypassing any
transpose operation. However, one of the transposes still hasn't been
eliminated. That is not ideal! What happened is that our pattern replaced the
last transform with the function input and left behind the now dead transpose
input. The Canonicalizer knows to clean up dead operations; however, MLIR
conservatively assumes that operations may have side-effects. We can fix this by
adding a new trait, `NoSideEffect`, to our `TransposeOp`:

```tablegen:
def TransposeOp : Toy_Op<"transpose", [NoSideEffect]> {...}
```

Let's retry now `toyc-ch3 test/transpose_transpose.toy -emit=mlir -opt`:

```mlir
func @transpose_transpose(%arg0: tensor<*xf64>) -> tensor<*xf64> {
  "toy.return"(%arg0) : (tensor<*xf64>) -> ()
}
```

Perfect!
No `transpose` operation is left - the code is optimal.

In the next section, we use DRR for pattern match optimizations associated with
the Reshape op.

# Optimize Reshapes using DRR

Declarative, rule-based pattern-match and rewrite (DRR) is an operation
DAG-based declarative rewriter that provides a table-based syntax for
pattern-match and rewrite rules:

```tablegen:
class Pattern<
    dag sourcePattern, list<dag> resultPatterns,
    list<dag> additionalConstraints = [],
    dag benefitsAdded = (addBenefit 0)>;
```

A redundant reshape optimization similar to SimplifyRedundantTranspose can be
expressed more simply using DRR as follows:

```tablegen:
// Reshape(Reshape(x)) = Reshape(x)
def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)),
                                   (ReshapeOp $arg)>;
```

The automatically generated C++ code corresponding to each of the DRR patterns
can be found under path/to/BUILD/projects/mlir/examples/toy/Ch3/ToyCombine.inc.

DRR also provides a method for adding argument constraints when the
transformation is conditional on some properties of the arguments and results.
An example is a transformation that eliminates reshapes when they are redundant,
i.e. when the input and output shapes are identical.

```tablegen:
def TypesAreIdentical : Constraint<CPred<"$0->getType() == $1->getType()">>;
def RedundantReshapeOptPattern : Pat<
  (ReshapeOp:$res $arg), (replaceWithValue $arg),
  [(TypesAreIdentical $res, $arg)]>;
```

Some optimizations may require additional transformations on instruction
arguments. This is achieved using NativeCodeCall, which allows for more complex
transformations either by calling into a C++ helper function or by using inline
C++. An example of such an optimization is FoldConstantReshape, where we
optimize Reshape of a constant value by reshaping the constant in place and
eliminating the reshape operation.
+ +```tablegen: +def ReshapeConstant : NativeCodeCall<"$0.reshape(($1->getType()).cast())">; +def FoldConstantReshapeOptPattern : Pat< + (ReshapeOp:$res (ConstantOp $arg)), + (ConstantOp (ReshapeConstant $arg, $res))>; +``` + +We demonstrate these reshape optimizations using the following +trivialReshape.toy program: + +```c++ +def main() { + var a<2,1> = [1, 2]; + var b<2,1> = a; + var c<2,1> = b; + print(c); +} +``` + +```mlir +module { + func @main() { + %0 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>} + : () -> tensor<2xf64> + %1 = "toy.reshape"(%0) : (tensor<2xf64>) -> tensor<2x1xf64> + %2 = "toy.reshape"(%1) : (tensor<2x1xf64>) -> tensor<2x1xf64> + %3 = "toy.reshape"(%2) : (tensor<2x1xf64>) -> tensor<2x1xf64> + "toy.print"(%3) : (tensor<2x1xf64>) -> () + "toy.return"() : () -> () + } +} +``` + +We can try to run `toyc-ch3 test/trivialReshape.toy -emit=mlir -opt` and observe +our pattern in action: + +```mlir +module { + func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00], [2.000000e+00]]> \ + : tensor<2x1xf64>} : () -> tensor<2x1xf64> + "toy.print"(%0) : (tensor<2x1xf64>) -> () + "toy.return"() : () -> () + } +} +``` + +As expected, no reshape operations remain after canonicalization. + +Further details on the declarative rewrite method can be found at +[Table-driven Declarative Rewrite Rule (DRR)](../../DeclarativeRewrites.md). + +In this chapter, we saw how to use certain core transformations through always +available hooks. In the [next chapter](Ch-4.md), we will see how to use generic +solutions that scale better through Interfaces. 
diff --git a/mlir/docs/Tutorials/Toy/Ch-4.md b/mlir/docs/Tutorials/Toy/Ch-4.md new file mode 100644 index 0000000000000000000000000000000000000000..4a4e11c68e608aeea929cd40982d157ffbc39265 --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-4.md @@ -0,0 +1,387 @@ +# Chapter 4: Enabling Generic Transformation with Interfaces + +[TOC] + +## Background: Grappling with an Extensible IR + +Through dialects, MLIR allows for the representation of many different levels of +abstraction; the Toy dialect that we have previously defined is one such +example. Though these different dialects may represent different abstractions, +there is often a set of common transformations and analyses that we would like +to perform. The problem that arises is that naively implementing each +transformation for each dialect leads to large amounts of code duplication, as +the internal algorithms are generally very similar, if not the same. We would +like to provide the ability for transformations to opaquely hook into dialects +like Toy to get the information they need. + +MLIR provides a set of always available-hooks for certain core transformations, +as seen in the [previous chapter](Ch-3.md), where we registered some +canonicalizations via a hook on our operations (`getCanonicalizationPatterns`). +However, these types of hooks don't really scale well. Therefore, a more generic +solution was designed, in the form of [interfaces](../../Interfaces.md), to make +the MLIR infrastructure as extensible as the representation. Interfaces provide +a generic mechanism for dialects and operations to provide information to a +transformation or analysis. + +## Shape Inference: Preparing for Code Generation + +Our Toy IR currently operates on generic tensors, meaning that we don't know the +shape of tensors other than during the initialization of constants. This +complicates optimizations, as well as code generation. 
Fortunately, we can +simply propagate the shapes through the computation until they are all known. +The issue is how to handle calls to user-defined generic functions: every call +site could deduce different shapes. One possibility would be to perform symbolic +inference based on the argument types, but this would be hard to generalize if +we were to introduce more control flow in the language. Another approach would +be function specialization, where every call site with new argument shapes +duplicates the called function and specializes it. The approach we take for Toy +is to inline all of the function calls, then perform intraprocedural shape +propagation. + +### Inlining + +Here we could write an inlining algorithm specifically designed for the Toy +dialect, but that can become quite complicated depending on the level of +complexity that we want. Disregarding cost modeling, the pure structural +transformation is already complex to implement from scratch. Thankfully, MLIR +provides a generic inliner algorithm that dialects can plug into. All we need to +do in Toy is to provide the [interfaces](../../Interfaces.md) for the inliner to +hook into. + +The first thing we need to do is to define the constraints on inlining +operations in the Toy dialect. This information is provided through a +[dialect interface](../../Interfaces.md#dialect-interfaces). This is essentially +a class containing a set of virtual hooks for which a dialect may provide a +specialization. In this case, the interface is `DialectInlinerInterface`. + +```c++ +/// This class defines the interface for handling inlining with Toy operations. +/// We simplify inherit from the base interface class and provide a +/// specialization of the necessary methods. +struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + /// This hook checks to see if the given operation is legal to inline into the + /// given region. 
For Toy this hook can simply return true, as all Toy
  /// operations are inlinable.
  bool isLegalToInline(Operation *, Region *,
                       BlockAndValueMapping &) const final {
    return true;
  }

  /// This hook is called when a terminator operation has been inlined. The only
  /// terminator that we have in the Toy dialect is the return
  /// operation(toy.return). We handle the return by replacing the values
  /// previously returned by the call operation with the operands of the
  /// return.
  void handleTerminator(Operation *op,
                        ArrayRef<Value> valuesToRepl) const final {
    // Only "toy.return" needs to be handled here.
    auto returnOp = cast<ReturnOp>(op);

    // Replace the values directly with the return operands.
    assert(returnOp.getNumOperands() == valuesToRepl.size());
    for (const auto &it : llvm::enumerate(returnOp.getOperands()))
      valuesToRepl[it.index()]->replaceAllUsesWith(it.value());
  }
};
```

We then register our dialect interface directly on the Toy dialect, similarly to
how we did for operations.

```c++
ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) {
  addInterfaces<ToyInlinerInterface>();
}
```

Next, we need to provide a way for the inliner to know that `toy.generic_call`
represents a call to a function. MLIR provides an
[operation interface](../../Interfaces.md#operation-interfaces) that can be used
to mark an operation as being "call-like". Unlike dialect interfaces, operation
interfaces provide a more refined granularity of information that is specific
and core to a single operation. The interface that we will be adding here is the
`CallOpInterface`.
+ +To add this interface we just need to include the definition into our operation +specification file (`Ops.td`): + +```tablegen +#ifdef MLIR_CALLINTERFACES +#else +include "mlir/Analysis/CallInterfaces.td" +#endif // MLIR_CALLINTERFACES +``` + +and add it to the traits list of `GenericCallOp`: + +```tablegen +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + ... +} +``` + +In the above we also use the `DeclareOpInterfaceMethods` directive to +auto-declare all of the interface methods in the class declaration of +GenericCallOp. This means that we just need to provide a definition: + +```c++ +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return getAttrOfType("callee"); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range GenericCallOp::getArgOperands() { return inputs(); } +``` + +Now that the inliner has been informed about the Toy dialect, we can add the +inliner pass to the pass manager for Toy: + +```c++ + pm.addPass(mlir::createInlinerPass()); +``` + +Now let's look at a working example: + +```mlir +func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = "toy.transpose"(%arg0) : (tensor<*xf64>) -> tensor<*xf64> + %1 = "toy.transpose"(%arg1) : (tensor<*xf64>) -> tensor<*xf64> + %2 = "toy.mul"(%0, %1) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> + "toy.return"(%2) : (tensor<*xf64>) -> () +} +func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %1 = "toy.reshape"(%0) : (tensor<2x3xf64>) -> tensor<2x3xf64> + %2 = "toy.constant"() {value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>} : () -> tensor<6xf64> + 
%3 = "toy.reshape"(%2) : (tensor<6xf64>) -> tensor<2x3xf64>
+ %4 = "toy.generic_call"(%1, %3) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64>
+ %5 = "toy.generic_call"(%3, %1) {callee = @multiply_transpose} : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64>
+ "toy.print"(%5) : (tensor<*xf64>) -> ()
+ "toy.return"() : () -> ()
+}
+```
+
+We have two calls to multiply_transpose that we would like to inline into main,
+but if we look at the output nothing has changed. We are missing one last subtle
+piece: there is a hidden type conversion on the edge of the call. If we look at
+the above, the operands to the generic_call are of type `tensor<2x3xf64>`, while
+the inputs to the function expect `tensor<*xf64>`. To resolve this difference,
+the inliner expects an explicit cast operation to be inserted. For this, we need
+to add a new operation to the Toy dialect, `ToyCastOp` (toy.cast), to represent
+casts between two different shapes.
+
+```tablegen
+def CastOp : Toy_Op<"cast", [NoSideEffect, SameOperandsAndResultShape]> {
+ let summary = "shape cast operation";
+ let description = [{
+ The "cast" operation converts a tensor from one type to an equivalent type
+ without changing any data elements. The source and destination types
+ must both be tensor types with the same element type. If both are ranked
+ then the rank should be the same and static dimensions should match. The
+ operation is invalid if converting to a mismatching constant dimension.
+ }];
+
+ let arguments = (ins F64Tensor:$input);
+ let results = (outs F64Tensor:$output);
+
+ // Set the folder bit so that we can fold redundant cast operations.
+ let hasFolder = 1;
+}
+```
+
+We can then override the necessary hook on the ToyInlinerInterface to insert
+this for us when necessary:
+
+```c++
+struct ToyInlinerInterface : public DialectInlinerInterface {
+ ...
+ + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. + Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; +``` + +If we run the working example through the pipeline again, we get the expected: + +```mlir +func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %1 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %2 = "toy.cast"(%1) : (tensor<2x3xf64>) -> tensor<*xf64> + %3 = "toy.cast"(%0) : (tensor<2x3xf64>) -> tensor<*xf64> + %4 = "toy.transpose"(%2) : (tensor<*xf64>) -> tensor<*xf64> + %5 = "toy.transpose"(%3) : (tensor<*xf64>) -> tensor<*xf64> + %6 = "toy.mul"(%4, %5) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> + "toy.print"(%6) : (tensor<*xf64>) -> () + "toy.return"() : () -> () +} +``` + +NOTE: The generic inliner will also perform simplifications, so the output may +be a bit cleaner than expected. + +### Intraprocedural Shape Inference + +Now that we have inlined all of the functions, we are left with a main function +containing a mix of static and dynamically shaped operations. We can now write a +simple shape inference pass to propagate shapes intraprocedurally (within a +single function). 
We could write this as a pass that directly encodes the +constraints of the operations within the Toy dialect, but this seems like a good +candidate for a transformation that could be written generically. As a good rule +of thumb, it is best to express a transformation as generically as possible, +such that it can be extended to other dialects in the future. There is no +telling how many other dialects may have similar needs or encounter the same +problems. + +For shape inference, if we break down the problem to its core, we really just +want operations to tell us the expected outputs given a set of statically known +inputs. (We can definitely get more complex than that, but for our needs we can +keep it simple.) Given that this property is core to a specific operation, we +can define an operation interface that can be specified on operations that need +to have their result shapes inferred. + +Similarly to operations, we can also +[define operation interfaces](../../OpDefinitions.md#operation-interfaces) using +the operation definition specification (ODS) framework. + +The interface is defined by inheriting from `OpInterface`, which takes the name +to be given to the generated C++ interface class as a template argument. For our +purposes, we will name the generated class a simpler `ShapeInference`. We also +provide a description for the interface. + +```tablegen +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. + }]; +} +``` + +Next, we define the interface methods that the operations will need to provide. +An interface method is comprised of: a description; a C++ return type in string +form; a method name in string form; and a few optional components, depending on +the need. See the +[ODS documentation](../../OpDefinitions.md#operation-interfaces) for more +information. 
+
+```tablegen
+def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> {
+ let description = [{
+ Interface to access a registered method to infer the return types for an
+ operation that can be used during type inference.
+ }];
+
+ let methods = [
+ InterfaceMethod<"Infer and set the output shape for the current operation.",
+ "void", "inferShapes">
+ ];
+}
+```
+
+Now that the interface is defined, we can add it to the necessary Toy operations
+in a similar way to how we added the `CallOpInterface` to the GenericCallOp:
+
+```
+def MulOp : Toy_Op<"mul",
+ [..., DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> {
+ ...
+}
+```
+
+Each of these operations will then need to provide a definition for the
+`inferShapes()` method. As an example, for the mul op, the result shape is
+inferred as the shape of the inputs.
+
+```c++
+/// Infer the output shape of the MulOp, this is required by the shape inference
+/// interface.
+void MulOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); }
+```
+
+At this point, each of the necessary Toy operations provides a mechanism by
+which to infer their output shapes. The ShapeInferencePass is a FunctionPass: it
+will run on each Function in isolation. MLIR also supports general
+[OperationPasses](../../WritingAPass.md#operation-pass) that run on any isolated
+operation (i.e. other function-like operations), but here our module only
+contains functions, so there is no need to generalize to all operations.
+
+Implementing such a pass is done by creating a class inheriting from
+`mlir::FunctionPass` and overriding the `runOnFunction()` method:
+
+```c++
+class ShapeInferencePass : public mlir::FunctionPass<ShapeInferencePass> {
+ void runOnFunction() override {
+ FuncOp function = getFunction();
+ ...
+ }
+};
+```
+
+The algorithm operates as follows:
+
+1. Build a worklist containing all the operations that return a dynamically
+ shaped tensor: these are the operations that need shape inference.
+2.
Iterate on the worklist: + - find an operation to process: the next ready operation in the worklist + has all of its arguments non-generic, + - if no operation is found, break out of the loop, + - remove the operation from the worklist, + - infer the shape of its output from the argument types. +3. If the worklist is empty, the algorithm succeeded. + +When processing an operation, we query if it registered the `ShapeInference` +interface. + +```c++ + // Ask the operation to infer its output shapes. + LLVM_DEBUG(llvm::dbgs() << "Inferring shape for: " << *op << "\n"); + + /// We check if an operation has a particular interface by casting. + if (ShapeInference shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } +``` + +We can then add our pass to the pass manager: + +```c++ + pm.addPass(mlir::createShapeInferencePass()); +``` + +If we rerun our original example, we now get the following: + +```mlir +func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %1 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> + %2 = "toy.mul"(%1, %1) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> + "toy.print"(%2) : (tensor<3x2xf64>) -> () + "toy.return"() : () -> () +} +``` + +You can build `toyc-ch4` and try yourself: `toyc-ch4 +test/Examples/Toy/Ch4/codegen.toy -emit=mlir -opt`. + +In the [next chapter](Ch-5.md), we will start the process of code generation by +targeting a lower level dialect for optimizing some of the more compute-heavy +Toy operations. 
diff --git a/mlir/docs/Tutorials/Toy/Ch-5.md b/mlir/docs/Tutorials/Toy/Ch-5.md new file mode 100644 index 0000000000000000000000000000000000000000..8a4268b498fa5c3c039c1745f74dcbfc855bb74f --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-5.md @@ -0,0 +1,357 @@ +# Chapter 5: Partial Lowering to Lower-Level Dialects for Optimization + +[TOC] + +At this point, we are eager to generate actual code and see our Toy language +take life. We will use LLVM to generate code, but just showing the LLVM builder +interface here wouldn't be very exciting. Instead, we will show how to perform +progressive lowering through a mix of dialects coexisting in the same function. + +To make it more interesting, in this chapter we will consider that we want to +reuse existing optimizations implemented in a dialect optimizing affine +transformations: `Affine`. This dialect is tailored to the computation-heavy +part of the program and is limited: it doesn't support representing our +`toy.print` builtin, for instance, neither should it! Instead, we can target +`Affine` for the computation heavy part of Toy, and in the +[next chapter](Ch-6.md) directly the `LLVM IR` dialect for lowering `print`. As +part of this lowering, we will be lowering from the +[TensorType](../../LangRef.md#tensor-type) that `Toy` operates on to the +[MemRefType](../../LangRef.md#memref-type) that is indexed via an affine +loop-nest. Tensors represent an abstract value-typed sequence of data, meaning +that they don't live in any memory. MemRefs, on the other hand, represent lower +level buffer access, as they are concrete references to a region of memory. + +# Dialect Conversions + +MLIR has many different dialects, so it is important to have a unified framework +for [converting](../../Glossary.md#conversion) between them. This is where the +`DialectConversion` framework comes into play. This framework allows for +transforming a set of `illegal` operations to a set of `legal` ones. 
To use this
+framework, we need to provide two things (and an optional third):
+
+* A [Conversion Target](../../DialectConversion.md#conversion-target)
+
+ - This is the formal specification of what operations or dialects are
+ legal for the conversion. Operations that aren't legal will require
+ rewrite patterns to perform
+ [legalization](./../../Glossary.md#legalization).
+
+* A set of
+ [Rewrite Patterns](../../DialectConversion.md#rewrite-pattern-specification)
+
+ - These are the set of [patterns](../../QuickstartRewrites.md) used to
+ convert `illegal` operations into a set of zero or more `legal` ones.
+
+* Optionally, a [Type Converter](../../DialectConversion.md#type-conversion).
+
+ - If provided, this is used to convert the types of block arguments. We
+ won't be needing this for our conversion.
+
+## Conversion Target
+
+For our purposes, we want to convert the compute-intensive `Toy` operations into
+a combination of operations from the `Affine` and `Standard` dialects for
+further optimization. To start off the lowering, we first define our conversion
+target:
+
+```c++
+void ToyToAffineLoweringPass::runOnFunction() {
+ // The first thing to define is the conversion target. This will define the
+ // final target for this lowering.
+ mlir::ConversionTarget target(getContext());
+
+ // We define the specific operations, or dialects, that are legal targets for
+ // this lowering. In our case, we are lowering to a combination of the
+ // `Affine` and `Standard` dialects.
+ target.addLegalDialect<mlir::AffineOpsDialect, mlir::StandardOpsDialect>();
+
+ // We also define the Toy dialect as Illegal so that the conversion will fail
+ // if any of these operations are *not* converted. Given that we actually want
+ // a partial lowering, we explicitly mark the Toy operations that we don't
+ // want to lower, `toy.print`, as `legal`.
+ target.addIllegalDialect<ToyDialect>();
+ target.addLegalOp<PrintOp>();
+ ...
+} +``` + +## Conversion Patterns + +After the conversion target has been defined, we can define how to convert the +`illegal` operations into `legal` ones. Similarly to the canonicalization +framework introduced in [chapter 3](Ch-3.md), the +[`DialectConversion` framework](../../DialectConversion.md) also uses +[RewritePatterns](../../QuickstartRewrites.md) to perform the conversion logic. +These patterns may be the `RewritePatterns` seen before or a new type of pattern +specific to the conversion framework `ConversionPattern`. `ConversionPatterns` +are different from traditional `RewritePatterns` in that they accept an +additional `operands` parameter containing operands that have been +remapped/replaced. This is used when dealing with type conversions, as the +pattern will want to operate on values of the new type but match against the +old. For our lowering, this invariant will be useful as it translates from the +[TensorType](../../LangRef.md#tensor-type) currently being operated on to the +[MemRefType](../../LangRef.md#memref-type). Let's look at a snippet of lowering +the `toy.transpose` operation: + +```c++ +/// Lower the `toy.transpose` operation to an affine loop nest. +struct TransposeOpLowering : public mlir::ConversionPattern { + TransposeOpLowering(mlir::MLIRContext *ctx) + : mlir::ConversionPattern(TransposeOp::getOperationName(), 1, ctx) {} + + /// Match and rewrite the given `toy.transpose` operation, with the given + /// operands that have been remapped from `tensor<...>` to `memref<...>`. + mlir::PatternMatchResult + matchAndRewrite(mlir::Operation *op, ArrayRef operands, + mlir::ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + + // Call to a helper function that will lower the current operation to a set + // of affine loops. We provide a functor that operates on the remapped + // operands, as well as the loop induction variables for the inner most + // loop body. 
+ lowerOpToLoops( + op, operands, rewriter, + [loc](mlir::PatternRewriter &rewriter, + ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the TransposeOp. + // This allows for using the nice named accessors that are generated + // by the ODS. This adaptor is automatically provided by the ODS + // framework. + TransposeOpOperandAdaptor transposeAdaptor(memRefOperands); + mlir::Value input = transposeAdaptor.input(); + + // Transpose the elements by generating a load from the reverse + // indices. + SmallVector reverseIvs(llvm::reverse(loopIvs)); + return rewriter.create(loc, input, reverseIvs); + }); + return matchSuccess(); + } +}; +``` + +Now we can prepare the list of patterns to use during the lowering process: + +```c++ +void ToyToAffineLoweringPass::runOnFunction() { + ... + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + mlir::OwningRewritePatternList patterns; + patterns.insert<..., TransposeOpLowering>(&getContext()); + + ... +``` + +## Partial Lowering + +Once the patterns have been defined, we can perform the actual lowering. The +`DialectConversion` framework provides several different modes of lowering, but, +for our purposes, we will perform a partial lowering, as we will not convert +`toy.print` at this time. + +```c++ +void ToyToAffineLoweringPass::runOnFunction() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + mlir::ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine` and `Standard` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. 
Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that don't want + // to lower, `toy.print`, as `legal`. + target.addIllegalDialect(); + target.addLegalOp(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + mlir::OwningRewritePatternList patterns; + patterns.insert<..., TransposeOpLowering>(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + auto function = getFunction(); + if (mlir::failed(mlir::applyPartialConversion(function, target, patterns))) + signalPassFailure(); +} +``` + +### Design Considerations With Partial Lowering + +Before diving into the result of our lowering, this is a good time to discuss +potential design considerations when it comes to partial lowering. In our +lowering, we transform from a value-type, TensorType, to an allocated +(buffer-like) type, MemRefType. However, given that we do not lower the +`toy.print` operation, we need to temporarily bridge these two worlds. There are +many ways to go about this, each with their own tradeoffs: + +* Generate `load` operations from the buffer + +One option is to generate `load` operations from the buffer type to materialize +an instance of the value type. This allows for the definition of the `toy.print` +operation to remain unchanged. The downside to this approach is that the +optimizations on the `affine` dialect are limited, because the `load` will +actually involve a full copy that is only visible *after* our optimizations have +been performed. + +* Generate a new version of `toy.print` that operates on the lowered type + +Another option would be to have another, lowered, variant of `toy.print` that +operates on the lowered type. 
The benefit of this option is that there is no
+hidden, unnecessary copy to the optimizer. The downside is that another
+operation definition is needed that may duplicate many aspects of the first.
+Defining a base class in [ODS](../../OpDefinitions.md) may simplify this, but
+you still need to treat these operations separately.
+
+* Update `toy.print` to allow for operating on the lowered type
+
+A third option is to update the current definition of `toy.print` to allow for
+operating on the lowered type. The benefit of this approach is that it is
+simple, does not introduce an additional hidden copy, and does not require
+another operation definition. The downside to this option is that it requires
+mixing abstraction levels in the `Toy` dialect.
+
+For the sake of simplicity, we will use the third option for this lowering. This
+involves updating the type constraints on the PrintOp in the operation
+definition file:
+
+```tablegen
+def PrintOp : Toy_Op<"print"> {
+ ...
+
+ // The print operation takes an input tensor to print.
+ // We also allow a F64MemRef to enable interop during partial lowering.
+ let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input); +} +``` + +## Complete Toy Example + +Looking back at our current working example: + +```mlir +func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> + %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> + "toy.print"(%3) : (tensor<3x2xf64>) -> () + "toy.return"() : () -> () +} +``` + +With affine lowering added to our pipeline, we can now generate: + +```mlir +func @main() { + %cst = constant 1.000000e+00 : f64 + %cst_0 = constant 2.000000e+00 : f64 + %cst_1 = constant 3.000000e+00 : f64 + %cst_2 = constant 4.000000e+00 : f64 + %cst_3 = constant 5.000000e+00 : f64 + %cst_4 = constant 6.000000e+00 : f64 + + // Allocating buffers for the inputs and outputs. + %0 = alloc() : memref<3x2xf64> + %1 = alloc() : memref<3x2xf64> + %2 = alloc() : memref<2x3xf64> + + // Initialize the input buffer with the constant values. + affine.store %cst, %2[0, 0] : memref<2x3xf64> + affine.store %cst_0, %2[0, 1] : memref<2x3xf64> + affine.store %cst_1, %2[0, 2] : memref<2x3xf64> + affine.store %cst_2, %2[1, 0] : memref<2x3xf64> + affine.store %cst_3, %2[1, 1] : memref<2x3xf64> + affine.store %cst_4, %2[1, 2] : memref<2x3xf64> + + // Load the transpose value from the input buffer and store it into the + // next input buffer. + affine.for %arg0 = 0 to 3 { + affine.for %arg1 = 0 to 2 { + %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> + affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> + } + } + + // Multiply and store into the output buffer. 
+ affine.for %arg0 = 0 to 2 { + affine.for %arg1 = 0 to 3 { + %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> + %4 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> + %5 = mulf %3, %4 : f64 + affine.store %5, %0[%arg0, %arg1] : memref<3x2xf64> + } + } + + // Print the value held by the buffer. + "toy.print"(%0) : (memref<3x2xf64>) -> () + dealloc %2 : memref<2x3xf64> + dealloc %1 : memref<3x2xf64> + dealloc %0 : memref<3x2xf64> + return +} +``` + +## Taking Advantage of Affine Optimization + +Our naive lowering is correct, but it leaves a lot to be desired with regards to +efficiency. For example, the lowering of `toy.mul` has generated some redundant +loads. Let's look at how adding a few existing optimizations to the pipeline can +help clean this up. Adding the `LoopFusion` and `MemRefDataFlowOpt` passes to +the pipeline gives the following result: + +```mlir +func @main() { + %cst = constant 1.000000e+00 : f64 + %cst_0 = constant 2.000000e+00 : f64 + %cst_1 = constant 3.000000e+00 : f64 + %cst_2 = constant 4.000000e+00 : f64 + %cst_3 = constant 5.000000e+00 : f64 + %cst_4 = constant 6.000000e+00 : f64 + + // Allocating buffers for the inputs and outputs. + %0 = alloc() : memref<3x2xf64> + %1 = alloc() : memref<2x3xf64> + + // Initialize the input buffer with the constant values. + affine.store %cst, %1[0, 0] : memref<2x3xf64> + affine.store %cst_0, %1[0, 1] : memref<2x3xf64> + affine.store %cst_1, %1[0, 2] : memref<2x3xf64> + affine.store %cst_2, %1[1, 0] : memref<2x3xf64> + affine.store %cst_3, %1[1, 1] : memref<2x3xf64> + affine.store %cst_4, %1[1, 2] : memref<2x3xf64> + + affine.for %arg0 = 0 to 3 { + affine.for %arg1 = 0 to 2 { + // Load the transpose value from the input buffer. + %2 = affine.load %1[%arg1, %arg0] : memref<2x3xf64> + + // Multiply and store into the output buffer. + %3 = mulf %2, %2 : f64 + affine.store %3, %0[%arg0, %arg1] : memref<3x2xf64> + } + } + + // Print the value held by the buffer. 
+ "toy.print"(%0) : (memref<3x2xf64>) -> () + dealloc %1 : memref<2x3xf64> + dealloc %0 : memref<3x2xf64> + return +} +``` + +Here, we can see that a redundant allocation was removed, the two loop nests +were fused, and some unnecessary `load`s were removed. You can build `toyc-ch5` +and try yourself: `toyc-ch5 test/lowering.toy -emit=mlir-affine`. We can also +check our optimizations by adding `-opt`. + +In this chapter we explored some aspects of partial lowering, with the intent to +optimize. In the [next chapter](Ch-6.md) we will continue the discussion about +dialect conversion by targeting LLVM for code generation. diff --git a/mlir/docs/Tutorials/Toy/Ch-6.md b/mlir/docs/Tutorials/Toy/Ch-6.md new file mode 100644 index 0000000000000000000000000000000000000000..939b2b4f776476179d232f595f5cdad4381b4668 --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-6.md @@ -0,0 +1,323 @@ +# Chapter 6: Lowering to LLVM and CodeGeneration + +[TOC] + +In the [previous chapter](Ch-5.md), we introduced the +[dialect conversion](../../DialectConversion.md) framework and partially lowered +many of the `Toy` operations to affine loop nests for optimization. In this +chapter, we will finally lower to LLVM for code generation. + +# Lowering to LLVM + +For this lowering, we will again use the dialect conversion framework to perform +the heavy lifting. However, this time, we will be performing a full conversion +to the [LLVM dialect](../../Dialects/LLVM.md). Thankfully, we have already +lowered all but one of the `toy` operations, with the last being `toy.print`. +Before going over the conversion to LLVM, let's lower the `toy.print` operation. +We will lower this operation to a non-affine loop nest that invokes `printf` for +each element. Note that, because the dialect conversion framework supports +[transitive lowering](Glossary.md#transitive-lowering), we don't need to +directly emit operations in the LLVM dialect. 
By transitive lowering, we mean +that the conversion framework may apply multiple patterns to fully legalize an +operation. In this example, we are generating a structured loop nest instead of +the branch-form in the LLVM dialect. As long as we then have a lowering from the +loop operations to LLVM, the lowering will still succeed. + +During lowering we can get, or build, the declaration for printf as so: + +```c++ +/// Return a symbol reference to the printf function, inserting it into the +/// module if necessary. +static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter, + ModuleOp module, + LLVM::LLVMDialect *llvmDialect) { + auto *context = module.getContext(); + if (module.lookupSymbol("printf")) + return SymbolRefAttr::get("printf", context); + + // Create a function declaration for printf, the signature is: + // * `i32 (i8*, ...)` + auto llvmI32Ty = LLVM::LLVMType::getInt32Ty(llvmDialect); + auto llvmI8PtrTy = LLVM::LLVMType::getInt8PtrTy(llvmDialect); + auto llvmFnType = LLVM::LLVMType::getFunctionTy(llvmI32Ty, llvmI8PtrTy, + /*isVarArg=*/true); + + // Insert the printf function into the body of the parent module. + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + rewriter.create(module.getLoc(), "printf", llvmFnType); + return SymbolRefAttr::get("printf", context); +} +``` + +Now that the lowering for the printf operation has been defined, we can specify +the components necessary for the lowering. These are largely the same as the +components defined in the [previous chapter](Ch-5.md). + +## Conversion Target + +For this conversion, aside from the top-level module, we will be lowering +everything to the LLVM dialect. + +```c++ + mlir::ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalOp(); +``` + +## Type Converter + +This lowering will also transform the MemRef types which are currently being +operated on into a representation in LLVM. 
To perform this conversion, we use a +TypeConverter as part of the lowering. This converter specifies how one type +maps to another. This is necessary now that we are performing more complicated +lowerings involving block arguments. Given that we don't have any +Toy-dialect-specific types that need to be lowered, the default converter is +enough for our use case. + +```c++ + LLVMTypeConverter typeConverter(&getContext()); +``` + +## Conversion Patterns + +Now that the conversion target has been defined, we need to provide the patterns +used for lowering. At this point in the compilation process, we have a +combination of `toy`, `affine`, and `std` operations. Luckily, the `std` and +`affine` dialects already provide the set of patterns needed to transform them +into LLVM dialect. These patterns allow for lowering the IR in multiple stages +by relying on [transitive lowering](Glossary.md#transitive-lowering). + +```c++ + mlir::OwningRewritePatternList patterns; + mlir::populateAffineToStdConversionPatterns(patterns, &getContext()); + mlir::populateLoopToStdConversionPatterns(patterns, &getContext()); + mlir::populateStdToLLVMConversionPatterns(typeConverter, patterns); + + // The only remaining operation to lower from the `toy` dialect, is the + // PrintOp. + patterns.insert(&getContext()); +``` + +## Full Lowering + +We want to completely lower to LLVM, so we use a `FullConversion`. This ensures +that only legal operations will remain after the conversion. 
+ +```c++ + mlir::ModuleOp module = getModule(); + if (mlir::failed(mlir::applyFullConversion(module, target, patterns, + &typeConverter))) + signalPassFailure(); +``` + +Looking back at our current working example: + +```mlir +func @main() { + %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64> + %2 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64> + %3 = "toy.mul"(%2, %2) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64> + "toy.print"(%3) : (tensor<3x2xf64>) -> () + "toy.return"() : () -> () +} +``` + +We can now lower down to the LLVM dialect, which produces the following code: + +```mlir +llvm.func @free(!llvm<"i8*">) +llvm.func @printf(!llvm<"i8*">, ...) -> !llvm.i32 +llvm.func @malloc(!llvm.i64) -> !llvm<"i8*"> +llvm.func @main() { + %0 = llvm.mlir.constant(1.000000e+00 : f64) : !llvm.double + %1 = llvm.mlir.constant(2.000000e+00 : f64) : !llvm.double + + ... + +^bb16: + %221 = llvm.extractvalue %25[0 : index] : !llvm<"{ double*, i64, [2 x i64], [2 x i64] }"> + %222 = llvm.mlir.constant(0 : index) : !llvm.i64 + %223 = llvm.mlir.constant(2 : index) : !llvm.i64 + %224 = llvm.mul %214, %223 : !llvm.i64 + %225 = llvm.add %222, %224 : !llvm.i64 + %226 = llvm.mlir.constant(1 : index) : !llvm.i64 + %227 = llvm.mul %219, %226 : !llvm.i64 + %228 = llvm.add %225, %227 : !llvm.i64 + %229 = llvm.getelementptr %221[%228] : (!llvm<"double*">, !llvm.i64) -> !llvm<"double*"> + %230 = llvm.load %229 : !llvm<"double*"> + %231 = llvm.call @printf(%207, %230) : (!llvm<"i8*">, !llvm.double) -> !llvm.i32 + %232 = llvm.add %219, %218 : !llvm.i64 + llvm.br ^bb15(%232 : !llvm.i64) + + ... 
+ +^bb18: + %235 = llvm.extractvalue %65[0 : index] : !llvm<"{ double*, i64, [2 x i64], [2 x i64] }"> + %236 = llvm.bitcast %235 : !llvm<"double*"> to !llvm<"i8*"> + llvm.call @free(%236) : (!llvm<"i8*">) -> () + %237 = llvm.extractvalue %45[0 : index] : !llvm<"{ double*, i64, [2 x i64], [2 x i64] }"> + %238 = llvm.bitcast %237 : !llvm<"double*"> to !llvm<"i8*"> + llvm.call @free(%238) : (!llvm<"i8*">) -> () + %239 = llvm.extractvalue %25[0 : index] : !llvm<"{ double*, i64, [2 x i64], [2 x i64] }"> + %240 = llvm.bitcast %239 : !llvm<"double*"> to !llvm<"i8*"> + llvm.call @free(%240) : (!llvm<"i8*">) -> () + llvm.return +} +``` + +See [Conversion to the LLVM IR Dialect](../../ConversionToLLVMDialect.md) for +more in-depth details on lowering to the LLVM dialect. + +# CodeGen: Getting Out of MLIR + +At this point we are right at the cusp of code generation. We can generate code +in the LLVM dialect, so now we just need to export to LLVM IR and setup a JIT to +run it. + +## Emitting LLVM IR + +Now that our module is comprised only of operations in the LLVM dialect, we can +export to LLVM IR. To do this programmatically, we can invoke the following +utility: + +```c++ + std::unique_ptr llvmModule = mlir::translateModuleToLLVMIR(module); + if (!llvmModule) + /* ... an error was encountered ... */ +``` + +Exporting our module to LLVM IR generates: + +```.llvm +define void @main() { + ... + +102: + %103 = extractvalue { double*, i64, [2 x i64], [2 x i64] } %8, 0 + %104 = mul i64 %96, 2 + %105 = add i64 0, %104 + %106 = mul i64 %100, 1 + %107 = add i64 %105, %106 + %108 = getelementptr double, double* %103, i64 %107 + %109 = load double, double* %108 + %110 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double %109) + %111 = add i64 %100, 1 + br label %99 + + ... 
+ +115: + %116 = extractvalue { double*, i64, [2 x i64], [2 x i64] } %24, 0 + %117 = bitcast double* %116 to i8* + call void @free(i8* %117) + %118 = extractvalue { double*, i64, [2 x i64], [2 x i64] } %16, 0 + %119 = bitcast double* %118 to i8* + call void @free(i8* %119) + %120 = extractvalue { double*, i64, [2 x i64], [2 x i64] } %8, 0 + %121 = bitcast double* %120 to i8* + call void @free(i8* %121) + ret void +} +``` + +If we enable optimization on the generated LLVM IR, we can trim this down quite +a bit: + +```.llvm +define void @main() + %0 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 1.000000e+00) + %1 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 1.600000e+01) + %putchar = tail call i32 @putchar(i32 10) + %2 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 4.000000e+00) + %3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 2.500000e+01) + %putchar.1 = tail call i32 @putchar(i32 10) + %4 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 9.000000e+00) + %5 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @frmt_spec, i64 0, i64 0), double 3.600000e+01) + %putchar.2 = tail call i32 @putchar(i32 10) + ret void +} + +``` + +The full code listing for dumping LLVM IR can be found in `Ch6/toy.cpp` in the +`dumpLLVMIR()` function: + +```c++ + +int dumpLLVMIR(mlir::ModuleOp module) { + // Translate the module, that contains the LLVM dialect, to LLVM IR. 
+ auto llvmModule = mlir::translateModuleToLLVMIR(module); + if (!llvmModule) { + llvm::errs() << "Failed to emit LLVM IR\n"; + return -1; + } + + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + mlir::ExecutionEngine::setupTargetTriple(llvmModule.get()); + + /// Optionally run an optimization pipeline over the llvm module. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/EnableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + if (auto err = optPipeline(llvmModule.get())) { + llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; + return -1; + } + llvm::errs() << *llvmModule << "\n"; + return 0; +} +``` + +## Setting up a JIT + +Setting up a JIT to run the module containing the LLVM dialect can be done using +the `mlir::ExecutionEngine` infrastructure. This is a utility wrapper around +LLVM's JIT that accepts `.mlir` as input. The full code listing for setting up +the JIT can be found in `Ch6/toy.cpp` in the `runJit()` function: + +```c++ +int runJit(mlir::ModuleOp module) { + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // An optimization pipeline to use within the execution engine. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/EnableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + + // Create an MLIR execution engine. The execution engine eagerly JIT-compiles + // the module. + auto maybeEngine = mlir::ExecutionEngine::create(module, optPipeline); + assert(maybeEngine && "failed to construct an execution engine"); + auto &engine = maybeEngine.get(); + + // Invoke the JIT-compiled function. 
+ auto invocationResult = engine->invoke("main"); + if (invocationResult) { + llvm::errs() << "JIT invocation failed\n"; + return -1; + } + + return 0; +} +``` + +You can play around with it from the build directory: + +```sh +$ echo 'def main() { print([[1, 2], [3, 4]]); }' | ./bin/toyc-ch6 -emit=jit +1.000000 2.000000 +3.000000 4.000000 +``` + +You can also play with `-emit=mlir`, `-emit=mlir-affine`, `-emit=mlir-llvm`, and +`-emit=llvm` to compare the various levels of IR involved. Also try options like +[`--print-ir-after-all`](../../WritingAPass.md#ir-printing) to track the +evolution of the IR throughout the pipeline. + +So far, we have worked with primitive data types. In the +[next chapter](Ch-7.md), we will add a composite `struct` type. diff --git a/mlir/docs/Tutorials/Toy/Ch-7.md b/mlir/docs/Tutorials/Toy/Ch-7.md new file mode 100644 index 0000000000000000000000000000000000000000..6298e8253e9a5e350f7b28df478b8a788cc0cefc --- /dev/null +++ b/mlir/docs/Tutorials/Toy/Ch-7.md @@ -0,0 +1,539 @@ +# Chapter 7: Adding a Composite Type to Toy + +[TOC] + +In the [previous chapter](Ch-6.md), we demonstrated an end-to-end compilation +flow from our Toy front-end to LLVM IR. In this chapter, we will extend the Toy +language to support a new composite `struct` type. + +## Defining a `struct` in Toy + +The first thing we need to define is the interface of this type in our `toy` +source language. The general syntax of a `struct` type in Toy is as follows: + +```toy +# A struct is defined by using the `struct` keyword followed by a name. +struct MyStruct { + # Inside of the struct is a list of variable declarations without initializers + # or shapes, which may also be other previously defined structs. + var a; + var b; +} +``` + +Structs may now be used in functions as variables or parameters by using the +name of the struct instead of `var`. The members of the struct are accessed via +a `.` access operator. 
Values of `struct` type may be initialized with a +composite initializer, or a comma-separated list of other initializers +surrounded by `{}`. An example is shown below: + +```toy +struct Struct { + var a; + var b; +} + +# User defined generic function may operate on struct types as well. +def multiply_transpose(Struct value) { + # We can access the elements of a struct via the '.' operator. + return transpose(value.a) * transpose(value.b); +} + +def main() { + # We initialize struct values using a composite initializer. + Struct value = {[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]}; + + # We pass these arguments to functions like we do with variables. + var c = multiply_transpose(value); + print(c); +} +``` + +## Defining a `struct` in MLIR + +In MLIR, we will also need a representation for our struct types. MLIR does not +provide a type that does exactly what we need, so we will need to define our +own. We will simply define our `struct` as an unnamed container of a set of +element types. The name of the `struct` and its elements are only useful for the +AST of our `toy` compiler, so we don't need to encode it in the MLIR +representation. + +### Defining the Type Class + +#### Reserving a Range of Type Kinds + +Types in MLIR rely on having a unique `kind` value to ensure that casting checks +remain extremely efficient +([rationale](../../Rationale.md#reserving-dialect-type-kinds)). For `toy`, this +means we need to explicitly reserve a static range of type `kind` values in the +symbol registry file +[DialectSymbolRegistry](https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/DialectSymbolRegistry.def). + +```c++ +DEFINE_SYM_KIND_RANGE(LINALG) // Linear Algebra Dialect +DEFINE_SYM_KIND_RANGE(TOY) // Toy language (tutorial) Dialect + +// The following ranges are reserved for experimenting with MLIR dialects in a +// private context without having to register them here. 
+DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_0) +``` + +These definitions will provide a range in the Type::Kind enum to use when +defining the derived types. + +```c++ +/// Create a local enumeration with all of the types that are defined by Toy. +namespace ToyTypes { +enum Types { + Struct = mlir::Type::FIRST_TOY_TYPE, +}; +} // end namespace ToyTypes +``` + +#### Defining the Type Class + +As mentioned in [chapter 2](Ch-2.md), [`Type`](../../LangRef.md#type-system) +objects in MLIR are value-typed and rely on having an internal storage object +that holds the actual data for the type. The `Type` class in itself acts as a +simple wrapper around an internal `TypeStorage` object that is uniqued within an +instance of an `MLIRContext`. When constructing a `Type`, we are internally just +constructing and uniquing an instance of a storage class. + +When defining a new `Type` that requires additional information beyond just the +`kind` (e.g. the `struct` type, which requires additional information to hold +the element types), we will need to provide a derived storage class. The +`primitive` types that don't have any additional data (e.g. the +[`index` type](../../LangRef.md#index-type)) don't require a storage class. + +##### Defining the Storage Class + +Type storage objects contain all of the data necessary to construct and unique a +type instance. Derived storage classes must inherit from the base +`mlir::TypeStorage` and provide a set of aliases and hooks that will be used by +the `MLIRContext` for uniquing. Below is the definition of the storage instance +for our `struct` type, with each of the necessary requirements detailed inline: + +```c++ +/// This class represents the internal storage of the Toy `StructType`. +struct StructTypeStorage : public mlir::TypeStorage { + /// The `KeyTy` is a required type that provides an interface for the storage + /// instance. This type will be used when uniquing an instance of the type + /// storage. 
For our struct type, we will unique each instance structurally on + /// the elements that it contains. + using KeyTy = llvm::ArrayRef; + + /// A constructor for the type storage instance. + StructTypeStorage(llvm::ArrayRef elementTypes) + : elementTypes(elementTypes) {} + + /// Define the comparison function for the key type with the current storage + /// instance. This is used when constructing a new instance to ensure that we + /// haven't already uniqued an instance of the given key. + bool operator==(const KeyTy &key) const { return key == elementTypes; } + + /// Define a hash function for the key type. This is used when uniquing + /// instances of the storage. + /// Note: This method isn't necessary as both llvm::ArrayRef and mlir::Type + /// have hash functions available, so we could just omit this entirely. + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + /// Define a construction function for the key type from a set of parameters. + /// These parameters will be provided when constructing the storage instance + /// itself, see the `StructType::get` method further below. + /// Note: This method isn't necessary because KeyTy can be directly + /// constructed with the given parameters. + static KeyTy getKey(llvm::ArrayRef elementTypes) { + return KeyTy(elementTypes); + } + + /// Define a construction method for creating a new instance of this storage. + /// This method takes an instance of a storage allocator, and an instance of a + /// `KeyTy`. The given allocator must be used for *all* necessary dynamic + /// allocations used to create the type storage and its internal. + static StructTypeStorage *construct(mlir::TypeStorageAllocator &allocator, + const KeyTy &key) { + // Copy the elements from the provided `KeyTy` into the allocator. + llvm::ArrayRef elementTypes = allocator.copyInto(key); + + // Allocate the storage instance and construct it. 
+ return new (allocator.allocate()) + StructTypeStorage(elementTypes); + } + + /// The following field contains the element types of the struct. + llvm::ArrayRef elementTypes; +}; +``` + +##### Defining the Type Class + +With the storage class defined, we can add the definition for the user-visible +`StructType` class. This is the class that we will actually interface with. + +```c++ +/// This class defines the Toy struct type. It represents a collection of +/// element types. All derived types in MLIR must inherit from the CRTP class +/// 'Type::TypeBase'. It takes as template parameters the concrete type +/// (StructType), the base class to use (Type), and the storage class +/// (StructTypeStorage). +class StructType : public mlir::Type::TypeBase { +public: + /// Inherit some necessary constructors from 'TypeBase'. + using Base::Base; + + /// This static method is used to support type inquiry through isa, cast, + /// and dyn_cast. + static bool kindof(unsigned kind) { return kind == ToyTypes::Struct; } + + /// Create an instance of a `StructType` with the given element types. There + /// *must* be at least one element type. + static StructType get(llvm::ArrayRef elementTypes) { + assert(!elementTypes.empty() && "expected at least 1 element type"); + + // Call into a helper 'get' method in 'TypeBase' to get a uniqued instance + // of this type. The first two parameters are the context to unique in and + // the kind of the type. The parameters after the type kind are forwarded to + // the storage instance. + mlir::MLIRContext *ctx = elementTypes.front().getContext(); + return Base::get(ctx, ToyTypes::Struct, elementTypes); + } + + /// Returns the element types of this struct type. + llvm::ArrayRef getElementTypes() { + // 'getImpl' returns a pointer to the internal storage instance. + return getImpl()->elementTypes; + } + + /// Returns the number of element type held by this struct. 
+ size_t getNumElementTypes() { return getElementTypes().size(); } +}; +``` + +We register this type in the `ToyDialect` constructor in a similar way to how we +did with operations: + +```c++ +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) + : mlir::Dialect(getDialectNamespace(), ctx) { + addTypes(); +} +``` + +With this we can now use our `StructType` when generating MLIR from Toy. See +examples/toy/Ch7/mlir/MLIRGen.cpp for more details. + +### Parsing and Printing + +At this point we can use our `StructType` during MLIR generation and +transformation, but we can't output or parse `.mlir`. For this we need to add +support for parsing and printing instances of the `StructType`. This can be done +by overriding the `parseType` and `printType` methods on the `ToyDialect`. + +```c++ +class ToyDialect : public mlir::Dialect { +public: + /// Parse an instance of a type registered to the toy dialect. + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + + /// Print an instance of a type registered to the toy dialect. + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; +}; +``` + +These methods take an instance of a high-level parser or printer that allows for +easily implementing the necessary functionality. Before going into the +implementation, let's think about the syntax that we want for the `struct` type +in the printed IR. As described in the +[MLIR language reference](../../LangRef.md#dialect-types), dialect types are +generally represented as: `! dialect-namespace < type-data >`, with a pretty +form available under certain circumstances. The responsibility of our `Toy` +parser and printer is to provide the `type-data` bits. We will define our +`StructType` as having the following form: + +``` + struct-type ::= `struct` `<` type (`,` type)* `>` +``` + +#### Parsing + +An implementation of the parser is shown below: + +```c++ +/// Parse an instance of a type registered to the toy dialect. 
+mlir::Type ToyDialect::parseType(mlir::DialectAsmParser &parser) const { + // Parse a struct type in the following form: + // struct-type ::= `struct` `<` type (`,` type)* `>` + + // NOTE: All MLIR parser function return a ParseResult. This is a + // specialization of LogicalResult that auto-converts to a `true` boolean + // value on failure to allow for chaining, but may be used with explicit + // `mlir::failed/mlir::succeeded` as desired. + + // Parse: `struct` `<` + if (parser.parseKeyword("struct") || parser.parseLess()) + return Type(); + + // Parse the element types of the struct. + SmallVector elementTypes; + do { + // Parse the current element type. + llvm::SMLoc typeLoc = parser.getCurrentLocation(); + mlir::Type elementType; + if (parser.parseType(elementType)) + return nullptr; + + // Check that the type is either a TensorType or another StructType. + if (!elementType.isa() && + !elementType.isa()) { + parser.emitError(typeLoc, "element type for a struct must either " + "be a TensorType or a StructType, got: ") + << elementType; + return Type(); + } + elementTypes.push_back(elementType); + + // Parse the optional: `,` + } while (succeeded(parser.parseOptionalComma())); + + // Parse: `>` + if (parser.parseGreater()) + return Type(); + return StructType::get(elementTypes); +} +``` + +#### Printing + +An implementation of the printer is shown below: + +```c++ +/// Print an instance of a type registered to the toy dialect. +void ToyDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // Currently the only toy type is a struct type. + StructType structType = type.cast(); + + // Print the struct type according to the parser format. 
+
+  printer << "struct<";
+  mlir::interleaveComma(structType.getElementTypes(), printer);
+  printer << '>';
+}
+```
+
+Before moving on, let's look at a quick example showcasing the functionality
+we have now:
+
+```toy
+struct Struct {
+  var a;
+  var b;
+}
+
+def multiply_transpose(Struct value) {
+}
+```
+
+Which generates the following:
+
+```mlir
+module {
+  func @multiply_transpose(%arg0: !toy.struct<tensor<*xf64>, tensor<*xf64>>) {
+    "toy.return"() : () -> ()
+  }
+}
+```
+
+### Operating on `StructType`
+
+Now that the `struct` type has been defined, we can round-trip it through
+the IR. The next step is to add support for using it within our operations.
+
+#### Updating Existing Operations
+
+A few of our existing operations will need to be updated to handle `StructType`.
+The first step is to make the ODS framework aware of our Type so that we can use
+it in the operation definitions. A simple example is shown below:
+
+```tablegen
+// Provide a definition for the Toy StructType for use in ODS. This allows for
+// using StructType in a similar way to Tensor or MemRef.
+def Toy_StructType :
+    Type<CPred<"$_self.isa<StructType>()">, "Toy struct type">;
+
+// Provide a definition of the types that are used within the Toy dialect.
+def Toy_Type : AnyTypeOf<[F64Tensor, Toy_StructType]>;
+```
+
+We can then update our operations, e.g. `ReturnOp`, to also accept the
+`Toy_StructType`:
+
+```tablegen
+def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> {
+  ...
+  let arguments = (ins Variadic<Toy_Type>:$input);
+  ...
+}
+```
+
+#### Adding New `Toy` Operations
+
+In addition to the existing operations, we will be adding a few new operations
+that will provide more specific handling of `structs`.
+
+##### `toy.struct_constant`
+
+This new operation materializes a constant value for a struct. In our current
+modeling, we just use an [array attribute](../../LangRef.md#array-attribute)
+that contains a set of constant values for each of the `struct` elements.
+
+```mlir
+  %0 = "toy.struct_constant"() {
+    value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>]
+  } : () -> !toy.struct<tensor<*xf64>>
+```
+
+##### `toy.struct_access`
+
+This new operation materializes the Nth element of a `struct` value.
+
+```mlir
+  %0 = "toy.struct_constant"() {
+    value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>]
+  } : () -> !toy.struct<tensor<*xf64>>
+  %1 = "toy.struct_access"(%0) {index = 0 : i64} : (!toy.struct<tensor<*xf64>>) -> tensor<*xf64>
+```
+
+With these operations, we can revisit our original example:
+
+```toy
+struct Struct {
+  var a;
+  var b;
+}
+
+# User defined generic functions may operate on struct types as well.
+def multiply_transpose(Struct value) {
+  # We can access the elements of a struct via the '.' operator.
+  return transpose(value.a) * transpose(value.b);
+}
+
+def main() {
+  # We initialize struct values using a composite initializer.
+  Struct value = {[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]};
+
+  # We pass these arguments to functions like we do with variables.
+ var c = multiply_transpose(value); + print(c); +} +``` + +and finally get a full MLIR module: + +```mlir +module { + func @multiply_transpose(%arg0: !toy.struct, tensor<*xf64>>) -> tensor<*xf64> { + %0 = "toy.struct_access"(%arg0) {index = 0 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + %1 = "toy.transpose"(%0) : (tensor<*xf64>) -> tensor<*xf64> + %2 = "toy.struct_access"(%arg0) {index = 1 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + %3 = "toy.transpose"(%2) : (tensor<*xf64>) -> tensor<*xf64> + %4 = "toy.mul"(%1, %3) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> + "toy.return"(%4) : (tensor<*xf64>) -> () + } + func @main() { + %0 = "toy.struct_constant"() {value = [dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>]} : () -> !toy.struct, tensor<*xf64>> + %1 = "toy.generic_call"(%0) {callee = @multiply_transpose} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + "toy.print"(%1) : (tensor<*xf64>) -> () + "toy.return"() : () -> () + } +} +``` + +#### Optimizing Operations on `StructType` + +Now that we have a few operations operating on `StructType`, we also have many +new constant folding opportunities. 
+ +After inlining, the MLIR module in the previous section looks something like: + +```mlir +module { + func @main() { + %0 = "toy.struct_constant"() {value = [dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>, dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>]} : () -> !toy.struct, tensor<*xf64>> + %1 = "toy.struct_access"(%0) {index = 0 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + %2 = "toy.transpose"(%1) : (tensor<*xf64>) -> tensor<*xf64> + %3 = "toy.struct_access"(%0) {index = 1 : i64} : (!toy.struct, tensor<*xf64>>) -> tensor<*xf64> + %4 = "toy.transpose"(%3) : (tensor<*xf64>) -> tensor<*xf64> + %5 = "toy.mul"(%2, %4) : (tensor<*xf64>, tensor<*xf64>) -> tensor<*xf64> + "toy.print"(%5) : (tensor<*xf64>) -> () + "toy.return"() : () -> () + } +} +``` + +We have several `toy.struct_access` operations that access into a +`toy.struct_constant`. As detailed in [chapter 3](Ch-3.md), we can add folders +for these `toy` operations by setting the `hasFolder` bit on the operation +definition and providing a definition of the `*Op::fold` method. + +```c++ +/// Fold constants. +OpFoldResult ConstantOp::fold(ArrayRef operands) { return value(); } + +/// Fold struct constants. +OpFoldResult StructConstantOp::fold(ArrayRef operands) { + return value(); +} + +/// Fold simple struct access operations that access into a constant. +OpFoldResult StructAccessOp::fold(ArrayRef operands) { + auto structAttr = operands.front().dyn_cast_or_null(); + if (!structAttr) + return nullptr; + + size_t elementIndex = index().getZExtValue(); + return structAttr.getValue()[elementIndex]; +} +``` + +To ensure that MLIR generates the proper constant operations when folding our +`Toy` operations, i.e. `ConstantOp` for `TensorType` and `StructConstant` for +`StructType`, we will need to provide an override for the dialect hook +`materializeConstant`. 
This allows for generic MLIR operations to create
+constants for the `Toy` dialect when necessary.
+
+```c++
+mlir::Operation *ToyDialect::materializeConstant(mlir::OpBuilder &builder,
+                                                 mlir::Attribute value,
+                                                 mlir::Type type,
+                                                 mlir::Location loc) {
+  if (type.isa<StructType>())
+    return builder.create<StructConstantOp>(loc, type,
+                                            value.cast<mlir::ArrayAttr>());
+  return builder.create<ConstantOp>(loc, type,
+                                    value.cast<mlir::DenseElementsAttr>());
+}
+```
+
+With this, we can now generate code that can be lowered to LLVM without any
+changes to our pipeline.
+
+```mlir
+module {
+  func @main() {
+    %0 = "toy.constant"() {value = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>} : () -> tensor<2x3xf64>
+    %1 = "toy.transpose"(%0) : (tensor<2x3xf64>) -> tensor<3x2xf64>
+    %2 = "toy.mul"(%1, %1) : (tensor<3x2xf64>, tensor<3x2xf64>) -> tensor<3x2xf64>
+    "toy.print"(%2) : (tensor<3x2xf64>) -> ()
+    "toy.return"() : () -> ()
+  }
+}
+```
+
+You can build `toyc-ch7` and try it yourself: `toyc-ch7
+test/Examples/Toy/Ch7/struct-codegen.toy -emit=mlir`. More details on defining
+custom types can be found in
+[DefiningAttributesAndTypes](../../DefiningAttributesAndTypes.md).
diff --git a/mlir/docs/UsageOfConst.md b/mlir/docs/UsageOfConst.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e8ce78e960c256032bc508790c8e19b2123ed9b
--- /dev/null
+++ b/mlir/docs/UsageOfConst.md
@@ -0,0 +1,272 @@
+# Usage of 'Const' in MLIR, for core IR types
+
+aka, where'd `const` go?
+
+The MLIR data structures that represent the IR itself (Instruction, Block, etc)
+form a graph-based data structure, and the compiler analyses and passes
+frequently walk this graph (e.g. traversing from defs to users). The early
+design of MLIR adopted the `const` model of LLVM, which is familiar and well
+understood (even though the LLVM implementation is flawed in many ways).
+
+The design team since decided to change to a different model, which eschews
+`const` entirely for the core IR types: you should never see a `const` method on
+`Operation`, should never see the type `const Value`, and you shouldn't feel bad
+about this. That said, you *should* use `const` for non-IR types, like
+`SmallVector`'s and many other things.
+
+The document below explains this design point from the viewpoint of "why make a
+change", to explain the rationale and the tradeoffs involved that led us to this
+potentially controversial design point.
+
+Bjarke Roune summarized the situation like this:
+
+> In my opinion `const` correctness is highly valuable, catching many bugs and
+> making it clear in a code base where the mutations happen. In my opinion
+> `const` correctness still isn't worth it in particular for IR elements because
+> of the special uses and properties of IRs, in particular that it is common to
+> transfer a pointer/reference to an instruction from an analysis to an
+> optimization which will change the instruction. The analysis should be const,
+> the optimization needs to get a non-`const` pointer. So all analyses either
+> end up being templates (and if they never get instantiated in a const context,
+> then the point of `const` correctness has been defeated), you need to somehow
+> launder the const in a safe way or there will be `const_cast`s. These options
+> are all bad, probably so bad as to out-weigh the benefits of const.
+
+# Reconsidering `const` in MLIR
+
+This document argues this design is introducing significant sub-optimalities
+into the MLIR codebase, argues that the cost/benefit tradeoff of this design is
+a poor tradeoff, and proposes switching to a much simpler approach - eliminating
+the use of const on these IR types entirely.
+
+**Note:** **This document is only discussing things like `const Value` and
+`const Operation*`. There is no proposed change for other types, e.g.
+`SmallVector` references, the immutable types like `Attribute`, etc.** + +## Background: The LLVM Const Model + +The LLVM and MLIR data structures provide the IR data structures (like +`mlir::Operation`s and their users) as a structured cyclic graph data structure. +Clients of the IR typically walk up and down the graph, perform dynamic down +casting (of various sorts) to check for patterns, and use some high-abstraction +pattern matching and binding facilities to do their work. + +The basic idea of LLVM's design is that these traversals of the IR should +preserve the const'ness of a pointer: if you have a const pointer to an +instruction and ask for its parent (or operand, users, etc), you should get a +const pointer to the block containing the instruction (or value defining the +operand, instruction using the instruction, etc). The instruction class looks +like this: + +``` +namespace llvm { +class Instruction : ... { + BasicBlock *Parent; +public: + // A const instruction returns a const parent pointer. + inline const BasicBlock *getParent() const { return Parent; } + // A non-const instruction returns a non-const parent pointer. + inline BasicBlock *getParent() { return Parent; } +… +}; +} +``` + +The rationale for this design is that it would be const-incorrect to return a +non-const pointer from getParent, because you could then walk the block to find +the instruction again and get non-const references to the same instruction - all +without a `const_cast`. + +This const model is simple and the C++ type system generally supports it through +code duplication of methods. That said, LLVM is actually inconsistent and buggy +about this. Even the core classes have bugs: `llvm::Instruction::getOperand()` +isn't currently const correct! There are other subsystems (e.g. the +`llvm/IR/PatternMatch.h` APIs) where you can perform a pattern match on a const +IR object and bind a non-const IR object. + +LLVM is a mature technology with hundreds of people working on it. 
The fact that +it still isn't correctly following the const model it set out for strongly hints +that one of: 1) The design is too complicated to be practical, 2) the benefits +of the model aren't worth the cost of the complexity, or 3) both 1 and 2, +together in some combination. + +## Advantages of Const-correctness in MLIR + +Even though this doc argues for eliminating const from MLIR, it is important to +evaluate that as a tradeoff with the advantages the const model provides, +allowing us to do a cost/benefit tradeoff. These are the benefits we see: + +The major advantage of allowing const on MLIR types is as a marker in APIs that +indicate that the function will not modify the specified values. For example, +the dominator APIs have a `dominates(const Block*, const Block*)` method, and +the consts provide a way of indicating that the call won't modify the blocks +passed in - similarly predicates like `Instruction::isTerminator() const` do not +modify the receiver object. + +It is also an advantage that MLIR follows the generally prevailing pattern of +C++ code, which generally uses const. Consistency with the community norm is +important. + +## Costs of Const-correctness in MLIR + +As mentioned above, early work on MLIR adopted the same design as LLVM intended, +allowing const-correct traversals in the APIs. Here we discuss the various costs +of doing this by looking at some examples, listed in roughly increasing order of +severity. + +### Pervasively duplicated accessors + +Just as the getParent() example above shows, achieving this const model requires +that all of the graph traversal accessors be duplicated into const and non-const +versions. This causes API bloat and slows compile time, but these are minor +problems. 
+ +The more significant issue is that this duplication can be so significant that +the signal disappears in the noise, for example `mlir::Operation` ends up with +things like this, which is twice as much API surface area just to try to satisfy +const. + +```c++ + operand_iterator operand_begin(); + operand_iterator operand_end(); + + /// Returns an iterator on the underlying Value's (Value ). + operand_range getOperands(); + + // Support const operand iteration. + using const_operand_iterator = + OperandIterator; + using const_operand_range = llvm::iterator_range; + + const_operand_iterator operand_begin() const; + const_operand_iterator operand_end() const; + + /// Returns a const iterator on the underlying Value's (Value ). + llvm::iterator_range getOperands() const; + + ArrayRef getOpOperands() const { + return getOperandStorage().getOperands(); + } + MutableArrayRef getOpOperands() { + return getOperandStorage().getOperands(); + } + + OpOperand &getOpOperand(unsigned idx) { return getOpOperands()[idx]; } + const OpOperand &getOpOperand(unsigned idx) const { + return getOpOperands()[idx]; + } + +``` + +### Templated accessors + +A related issue is that having to provide both const and non-const versions of +accessors leads to us having to turn more code into templates than would +otherwise be desirable. Things like `ResultIterator` and `ResultTypeIterator` +are templates *_only_* because they are generic over const and non-const +versions of types. This leads to them being defined inline in headers (instead +of in .cpp files). + +Thus, our const model is leading to more code in headers and more complexity in +the implementation. + +### Const incorrect in practice + +For some things, const is more trouble than it is worth, so they never get +updated. + +This means that certain API in practice don't provide a const variant, leading +to pervasive use of `const_cast` to drop the const qualifier. 
For example the +logic in `Matchers.h` doesn't support const pointers at all (b/123355851), even +though matching and binding values themselves makes perfect sense for both const +and non-const values. Actually fixing this would cause massive code bloat and +complexity. + +Other parts of the code are just outright incorrect. For example, the operation +cloning methods are defined on Operation like this: + +```C++ +Operation *clone(BlockAndValueMapping &mapper, MLIRContext *context) const; + +Operation *clone(MLIRContext *context) const; +``` + +While it makes sense for a clone method to be `const` conceptually (the original +operation isn't modified) this is a violation of the model, since the returned +operation must be mutable, and provides access to the full graph of operands as +the original operation, violating the graph based const model we were shooting +for. + +### The `OpPointer` and `ConstOpPointer` Classes + +The "typed operation" classes for registered operations (e.g. like `DimOp` for +the "std.dim" operation in standard ops) contain a pointer to an operation and +provide typed APIs for processing it. + +However, this is a problem for our current `const` design - `const DimOp` means +the pointer itself is immutable, not the pointee. The current solution for this +is the `OpPointer<>` and `ConstOpPointer<>` classes, which exist solely to +provide const correctness when referring to a typed operation. Instead of +referring to `DimOp` directly, we need to use `OpPointer` and +`ConstOpPointer` to preserve this constness. + +While `auto` hides many instances of these `OpPointer` classes, their presence +leads to extremely ugly APIs. It also obscures the fact that the user does not +have a direct `DimOp` object, creating easy pitfalls with subtly incorrect +semantics: + +```C++ +// OpPointer encodes unnecessary and superfluous information into the API. 
SmallVector<OpPointer<AffineForOp>, 8> stripmineSink(
    OpPointer<AffineForOp> forOp, uint64_t factor,
    ArrayRef<OpPointer<AffineForOp>> targets);
// Compared to the much cleaner and easier to read...
SmallVector<AffineForOp, 8> stripmineSink(AffineForOp forOp, uint64_t factor,
                                          ArrayRef<AffineForOp> targets);

// OpPointer is easy to misuse.
if (auto *dimOp = inst->dyn_cast<DimOp>()) {
  // This is actually undefined behavior because dyn_cast actually returns
  // OpPointer<DimOp>. OpPointer<DimOp> happily implicitly converts to DimOp *
  // creating undefined behavior that will execute correctly most of the time.
}
```

It would be much better to eliminate them entirely, and just pass around `DimOp`
directly. For example, instead of:

```C++
LogicalResult mlir::getIndexSet(MutableArrayRef<OpPointer<AffineForOp>> forOps,
                                FlatAffineConstraints *domain) {

```

It would be a lot nicer to just have:

```c++
LogicalResult mlir::getIndexSet(MutableArrayRef<AffineForOp> forOps,
                                FlatAffineConstraints *domain) {
Types like `OpPointer` and `ConstOpPointer` that exist solely to propagate + const can be entirely removed from the codebase. +1. We can close bugs complaining about const incorrectness in the IR. diff --git a/mlir/docs/WritingAPass.md b/mlir/docs/WritingAPass.md new file mode 100644 index 0000000000000000000000000000000000000000..5119c469e20819a40d6fd41c33435d6f37ce5b7e --- /dev/null +++ b/mlir/docs/WritingAPass.md @@ -0,0 +1,835 @@ +# Writing a Pass + +[TOC] + +Passes represent the basic infrastructure for transformation and optimization. +This document provides a quickstart to the pass infrastructure in MLIR and how +to use it. + +See [MLIR specification](LangRef.md) for more information about MLIR and its +core aspects, such as the IR structure and operations. + +See [MLIR Rewrites](QuickstartRewrites.md) for a quick start on graph rewriting +in MLIR. If your transformation involves pattern matching operation DAGs, this +is a great place to start. + +## Operation Pass + +In MLIR, the main unit of abstraction and transformation is an +[operation](LangRef.md#operations). As such, the pass manager is designed to +work on instances of operations at different levels of nesting. The structure of +the [pass manager](#pass-manager), and the concept of nesting, is detailed +further below. All passes in MLIR derive from `OperationPass` and adhere to the +following restrictions; any noncompliance will lead to problematic behavior in +multithreaded and other advanced scenarios: + +* Modify anything within the parent block/region/operation/etc, outside of the + current operation being operated on. This includes adding or removing + operations from the parent block. +* Maintain pass state across invocations of `runOnOperation`. A pass may be + run on several different operations with no guarantee of execution order. + * When multithreading, a specific pass instance may not even execute on + all operations within the module. 
As such, a pass should not rely on + running on all operations. +* Modify the state of another operation not nested within the current + operation being operated on. + * Other threads may be operating on different operations within the module + simultaneously. +* Maintain any global mutable state, e.g. static variables within the source + file. All mutable state should be maintained by an instance of the pass. +* Must be copy-constructible, multiple instances of the pass may be created by + the pass manager to process operations in parallel. +* Inspect the IR of sibling operations. Other threads may be modifying these + operations in parallel. + +When creating an operation pass, there are two different types to choose from +depending on the usage scenario: + +### OperationPass : Op-Specific + +An `op-specific` operation pass operates explicitly on a given operation type. +This operation type must adhere to the restrictions set by the pass manager for +pass execution. + +To define an op-specific operation pass, a derived class must adhere to the +following: + +* Inherit from the CRTP class `OperationPass` and provide the operation type + as an additional template parameter. +* Override the virtual `void runOnOperation()` method. + +A simple pass may look like: + +```c++ +namespace { +struct MyFunctionPass : public OperationPass { + void runOnOperation() override { + // Get the current FuncOp operation being operated on. + FuncOp f = getOperation(); + + // Walk the operations within the function. + f.walk([](Operation *inst) { + .... + }); + } +}; +} // end anonymous namespace + +// Register this pass to make it accessible to utilities like mlir-opt. +// (Pass registration is discussed more below) +static PassRegistration pass( + "flag-name-to-invoke-pass-via-mlir-opt", "Pass description here"); +``` + +### OperationPass : Op-Agnostic + +An `op-agnostic` pass operates on the operation type of the pass manager that it +is added to. 
This means that a pass that operates on several different operation +types in the same way only needs one implementation. + +To create an operation pass, a derived class must adhere to the following: + +* Inherit from the CRTP class `OperationPass`. +* Override the virtual `void runOnOperation()` method. + +A simple pass may look like: + +```c++ +struct MyOperationPass : public OperationPass { + void runOnOperation() override { + // Get the current operation being operated on. + Operation *op = getOperation(); + ... + } +}; +``` + +## Analysis Management + +An important concept, along with transformation passes, are analyses. These are +conceptually similar to transformation passes, except that they compute +information on a specific operation without modifying it. In MLIR, analyses are +not passes but free-standing classes that are computed lazily on-demand and +cached to avoid unnecessary recomputation. An analysis in MLIR must adhere to +the following: + +* Provide a valid constructor taking an `Operation*`. +* Must not modify the given operation. + +An analysis may provide additional hooks to control various behavior: + +* `bool isInvalidated(const AnalysisManager::PreservedAnalyses &)` + +Given a preserved analysis set, the analysis returns true if it should truly be +invalidated. This allows for more fine-tuned invalidation in cases where an +analysis wasn't explicitly marked preserved, but may be preserved (or +invalidated) based upon other properties such as analyses sets. + +### Querying Analyses + +The base `OperationPass` class provide utilities for querying and preserving +analyses for the current operation being processed. + +* OperationPass automatically provides the following utilities for querying + analyses: + * `getAnalysis<>` + - Get an analysis for the current operation, constructing it if + necessary. + * `getCachedAnalysis<>` + - Get an analysis for the current operation, if it already exists. 
+ * `getCachedParentAnalysis<>` + - Get an analysis for a given parent operation, if it exists. + * `getCachedChildAnalysis<>` + - Get an analysis for a given child operation, if it exists. + * `getChildAnalysis<>` + - Get an analysis for a given child operation, constructing it if + necessary. + +Using the example passes defined above, let's see some examples: + +```c++ +/// An interesting analysis. +struct MyOperationAnalysis { + // Compute this analysis with the provided operation. + MyOperationAnalysis(Operation *op); +}; + +void MyOperationPass::runOnOperation() { + // Query MyOperationAnalysis for the current operation. + MyOperationAnalysis &myAnalysis = getAnalysis(); + + // Query a cached instance of MyOperationAnalysis for the current operation. + // It will not be computed if it doesn't exist. + auto optionalAnalysis = getCachedAnalysis(); + if (optionalAnalysis) + ... + + // Query a cached instance of MyOperationAnalysis for the parent operation of + // the current operation. It will not be computed if it doesn't exist. + auto optionalAnalysis = getCachedParentAnalysis(); + if (optionalAnalysis) + ... +} +``` + +### Preserving Analyses + +Analyses that are constructed after being queried by a pass are cached to avoid +unnecessary computation if they are requested again later. To avoid stale +analyses, all analyses are assumed to be invalidated by a pass. To avoid +invalidation, a pass must specifically mark analyses that are known to be +preserved. + +* All Pass classes automatically provide the following utilities for + preserving analyses: + * `markAllAnalysesPreserved` + * `markAnalysesPreserved<>` + +```c++ +void MyOperationPass::runOnOperation() { + // Mark all analyses as preserved. This is useful if a pass can guarantee + // that no transformation was performed. + markAllAnalysesPreserved(); + + // Mark specific analyses as preserved. 
This is used if some transformation + // was performed, but some analyses were either unaffected or explicitly + // preserved. + markAnalysesPreserved(); +} +``` + +## Pass Failure + +Passes in MLIR are allowed to gracefully fail. This may happen if some invariant +of the pass was broken, potentially leaving the IR in some invalid state. If +such a situation occurs, the pass can directly signal a failure to the pass +manager. If a pass signaled a failure when executing, no other passes in the +pipeline will execute and the `PassManager::run` will return failure. Failure +signaling is provided in the form of a `signalPassFailure` method. + +```c++ +void MyPass::runOnOperation() { + // Signal failure on a broken invariant. + if (some_broken_invariant) { + signalPassFailure(); + return; + } +} +``` + +## Pass Manager + +Above we introduced the different types of passes and their constraints. Now +that we have our pass we need to be able to run it over a specific module. This +is where the pass manager comes into play. The `PassManager` class is used to +configure and run a pipeline. The `OpPassManager` class is used to schedule +passes to run at a specific level of nesting. + +### OpPassManager + +An `OpPassManager` is essentially a collection of passes to execute on an +operation of a given type. This operation type must adhere to the following +requirement: + +* Must be registered and marked `IsolatedFromAbove`. + + * Passes are expected to not modify operations at or above the current + operation being processed. If the operation is not isolated, it may + inadvertently modify the use-list of an operation it is not supposed to + modify. + +Passes can be added to a pass manager via `addPass`. The pass must either be an +`op-specific` pass operating on the same operation type as `OpPassManager`, or +an `op-agnostic` pass. + +An `OpPassManager` cannot be created directly, but must be explicitly nested +within another `OpPassManager` via the `nest<>` method. 
This method takes the +operation type that the nested pass manager will operate on. At the top-level, a +`PassManager` acts as an `OpPassManager` that operates on the +[`module`](LangRef.md#module) operation. Nesting in this sense, corresponds to +the structural nesting within [Regions](LangRef.md#regions) of the IR. + +For example, the following `.mlir`: + +``` +module { + spv.module "Logical" "GLSL450" { + func @foo() { + ... + } + } +} +``` + +Has the nesting structure of: + +``` +`module` + `spv.module` + `function` +``` + +Below is an example of constructing a pipeline that operates on the above +structure: + +```c++ +PassManager pm(ctx); + +// Add a pass on the top-level module operation. +pm.addPass(std::make_unique()); + +// Nest a pass manager that operates on spirv module operations nested directly +// under the top-level module. +OpPassManager &nestedModulePM = pm.nest(); +nestedModulePM.addPass(std::make_unique()); + +// Nest a pass manager that operates on functions within the nested SPIRV +// module. +OpPassManager &nestedFunctionPM = nestedModulePM.nest(); +nestedFunctionPM.addPass(std::make_unique()); + +// Run the pass manager on the top-level module. +Module m = ...; +if (failed(pm.run(m))) + ... // One of the passes signaled a failure. +``` + +The above pass manager would contain the following pipeline structure: + +``` +OpPassManager + MyModulePass + OpPassManager + MySPIRVModulePass + OpPassManager + MyFunctionPass +``` + +These pipelines are then run over a single operation at a time. This means that, +for example, given a series of consecutive passes on FuncOp, it will execute all +on the first function, then all on the second function, etc. until the entire +program has been run through the passes. This provides several benefits: + +* This improves the cache behavior of the compiler, because it is only + touching a single function at a time, instead of traversing the entire + program. 
+* This improves multi-threading performance by reducing the number of jobs + that need to be scheduled, as well as increasing the efficiency of each job. + An entire function pipeline can be run on each function asynchronously. + +## Pass Registration + +Briefly shown in the example definitions of the various pass types is the +`PassRegistration` class. This is a utility to register derived pass classes so +that they may be created, and inspected, by utilities like mlir-opt. Registering +a pass class takes the form: + +```c++ +static PassRegistration pass("command-line-arg", "description"); +``` + +* `MyPass` is the name of the derived pass class. +* "command-line-arg" is the argument to use on the command line to invoke the + pass from `mlir-opt`. +* "description" is a description of the pass. + +For passes that cannot be default-constructed, `PassRegistration` accepts an +optional third argument that takes a callback to create the pass: + +```c++ +static PassRegistration pass( + "command-line-arg", "description", + []() -> std::unique_ptr { + std::unique_ptr p = std::make_unique(/*options*/); + /*... non-trivial-logic to configure the pass ...*/; + return p; + }); +``` + +This variant of registration can be used, for example, to accept the +configuration of a pass from command-line arguments and pass it over to the pass +constructor. Make sure that the pass is copy-constructible in a way that does +not share data as the [pass manager](#pass-manager) may create copies of the +pass to run in parallel. + +### Pass Pipeline Registration + +Described above is the mechanism used for registering a specific derived pass +class. On top of that, MLIR allows for registering custom pass pipelines in a +similar fashion. This allows for custom pipelines to be available to tools like +mlir-opt in the same way that passes are, which is useful for encapsulating +common pipelines like the "-O1" series of passes. 
Pipelines are registered via a +similar mechanism to passes in the form of `PassPipelineRegistration`. Compared +to `PassRegistration`, this class takes an additional parameter in the form of a +pipeline builder that modifies a provided `OpPassManager`. + +```c++ +void pipelineBuilder(OpPassManager &pm) { + pm.addPass(std::make_unique()); + pm.addPass(std::make_unique()); +} + +// Register an existing pipeline builder function. +static PassPipelineRegistration<> pipeline( + "command-line-arg", "description", pipelineBuilder); + +// Register an inline pipeline builder. +static PassPipelineRegistration<> pipeline( + "command-line-arg", "description", [](OpPassManager &pm) { + pm.addPass(std::make_unique()); + pm.addPass(std::make_unique()); + }); +``` + +Pipeline registration also allows for simplified registration of +specifializations for existing passes: + +```c++ +static PassPipelineRegistration<> foo10( + "foo-10", "Foo Pass 10", [] { return std::make_unique(10); } ); +``` + +### Textual Pass Pipeline Specification + +In the previous sections, we showed how to register passes and pass pipelines +with a specific argument and description. Once registered, these can be used on +the command line to configure a pass manager. The limitation of using these +arguments directly is that they cannot build a nested pipeline. For example, if +our module has another module nested underneath, with just `-my-module-pass` +there is no way to specify that this pass should run on the nested module and +not the top-level module. This is due to the flattened nature of the command +line. + +To circumvent this limitation, MLIR also supports a textual description of a +pass pipeline. This allows for explicitly specifying the structure of the +pipeline to add to the pass manager. This includes the nesting structure, as +well as the passes and pass pipelines to run. 
A textual pipeline is defined as a +series of names, each of which may in itself recursively contain a nested +pipeline description. The syntax for this specification is as follows: + +```ebnf +pipeline ::= op-name `(` pipeline-element (`,` pipeline-element)* `)` +pipeline-element ::= pipeline | (pass-name | pass-pipeline-name) options? +options ::= '{' (key ('=' value)?)+ '}' +``` + +* `op-name` + * This corresponds to the mnemonic name of an operation to run passes on, + e.g. `func` or `module`. +* `pass-name` | `pass-pipeline-name` + * This corresponds to the command-line argument of a registered pass or + pass pipeline, e.g. `cse` or `canonicalize`. +* `options` + * Options are pass specific key value pairs that are handled as described + in the [instance specific pass options](#instance-specific-pass-options) + section. + +For example, the following pipeline: + +```shell +$ mlir-opt foo.mlir -cse -canonicalize -convert-std-to-llvm +``` + +Can also be specified as (via the `-pass-pipeline` flag): + +```shell +$ mlir-opt foo.mlir -pass-pipeline='func(cse, canonicalize), convert-std-to-llvm' +``` + +In order to support round-tripping your pass to the textual representation using +`OpPassManager::printAsTextualPipeline(raw_ostream&)`, override +`Pass::printAsTextualPipeline(raw_ostream&)` to format your pass-name and +options in the format described above. + +### Instance Specific Pass Options + +Options may be specified for a parametric pass. Individual options are defined +using the [LLVM command line](https://llvm.org/docs/CommandLine.html) flag +definition rules. These options will then be parsed at pass construction time +independently for each instance of the pass. To provide options for passes, the +`Option<>` and `OptionList<>` classes may be used: + +```c++ +struct MyPass ... { + /// Make sure that we have a valid default constructor and copy constructor to + /// make sure that the options are initialized properly. 
  MyPass() = default;
  MyPass(const MyPass& pass) {}

  // These just forward onto llvm::cl::list and llvm::cl::opt respectively.
  Option<int> exampleOption{*this, "flag-name", llvm::cl::desc("...")};
  ListOption<int> exampleListOption{*this, "list-flag-name",
                                    llvm::cl::desc("...")};
This
class takes as constructor arguments: the parent pass, a name, and a
description.
+===-------------------------------------------------------------------------=== +MyPass + (S) 21 testStat - A test statistic +``` + +## Pass Instrumentation + +MLIR provides a customizable framework to instrument pass execution and analysis +computation. This is provided via the `PassInstrumentation` class. This class +provides hooks into the PassManager that observe various pass events: + +* `runBeforePipeline` + * This callback is run just before a pass pipeline, i.e. pass manager, is + executed. +* `runAfterPipeline` + * This callback is run right after a pass pipeline has been executed, + successfully or not. +* `runBeforePass` + * This callback is run just before a pass is executed. +* `runAfterPass` + * This callback is run right after a pass has been successfully executed. + If this hook is executed, runAfterPassFailed will not be. +* `runAfterPassFailed` + * This callback is run right after a pass execution fails. If this hook is + executed, runAfterPass will not be. +* `runBeforeAnalysis` + * This callback is run just before an analysis is computed. +* `runAfterAnalysis` + * This callback is run right after an analysis is computed. + +PassInstrumentation objects can be registered directly with a +[PassManager](#pass-manager) instance via the `addInstrumentation` method. +Instrumentations added to the PassManager are run in a stack like fashion, i.e. +the last instrumentation to execute a `runBefore*` hook will be the first to +execute the respective `runAfter*` hook. 
Below is an example instrumentation
that counts the number of times DominanceInfo is computed:

```c++
struct DominanceCounterInstrumentation : public PassInstrumentation {
  unsigned &count;

  DominanceCounterInstrumentation(unsigned &count) : count(count) {}
  void runAfterAnalysis(llvm::StringRef, AnalysisID *id, Operation *) override {
    if (id == AnalysisID::getID<DominanceInfo>())
      ++count;
  }
};

MLIRContext *ctx = ...;
PassManager pm(ctx);

// Add the instrumentation to the pass manager.
unsigned domInfoCount;
pm.addInstrumentation(
    std::make_unique<DominanceCounterInstrumentation>(domInfoCount));

// Run the pass manager on a module operation.
ModuleOp m = ...;
if (failed(pm.run(m)))
  ...

llvm::errs() << "DominanceInfo was computed " << domInfoCount << " times!\n";
```
+ +```shell +$ mlir-opt foo.mlir -disable-pass-threading -pass-pipeline='func(cse,canonicalize)' -convert-std-to-llvm -pass-timing -pass-timing-display=list + +===-------------------------------------------------------------------------=== + ... Pass execution timing report ... +===-------------------------------------------------------------------------=== + Total Execution Time: 0.0203 seconds + + ---Wall Time--- --- Name --- + 0.0047 ( 55.9%) Canonicalizer + 0.0019 ( 22.2%) VerifierPass + 0.0016 ( 18.5%) LLVMLoweringPass + 0.0003 ( 3.4%) CSE + 0.0002 ( 1.9%) (A) DominanceInfo + 0.0084 (100.0%) Total +``` + +##### Pipeline Display Mode + +In this mode, the results are displayed in a nested pipeline view that mirrors +the internal pass pipeline that is being executed in the pass manager. This view +is useful for understanding specifically which parts of the pipeline are taking +the most time, and can also be used to identify when analyses are being +invalidated and recomputed. This is the default display mode. + +```shell +$ mlir-opt foo.mlir -disable-pass-threading -pass-pipeline='func(cse,canonicalize)' -convert-std-to-llvm -pass-timing + +===-------------------------------------------------------------------------=== + ... Pass execution timing report ... +===-------------------------------------------------------------------------=== + Total Execution Time: 0.0249 seconds + + ---Wall Time--- --- Name --- + 0.0058 ( 70.8%) 'func' Pipeline + 0.0004 ( 4.3%) CSE + 0.0002 ( 2.6%) (A) DominanceInfo + 0.0004 ( 4.8%) VerifierPass + 0.0046 ( 55.4%) Canonicalizer + 0.0005 ( 6.2%) VerifierPass + 0.0005 ( 5.8%) VerifierPass + 0.0014 ( 17.2%) LLVMLoweringPass + 0.0005 ( 6.2%) VerifierPass + 0.0082 (100.0%) Total +``` + +##### Multi-threaded Pass Timing + +When multi-threading is enabled in the pass manager the meaning of the display +slightly changes. First, a new timing column is added, `User Time`, that +displays the total time spent across all threads. 
Secondly, the `Wall Time` +column displays the longest individual time spent amongst all of the threads. +This means that the `Wall Time` column will continue to give an indicator on the +perceived time, or clock time, whereas the `User Time` will display the total +cpu time. + +```shell +$ mlir-opt foo.mlir -pass-pipeline='func(cse,canonicalize)' -convert-std-to-llvm -pass-timing + +===-------------------------------------------------------------------------=== + ... Pass execution timing report ... +===-------------------------------------------------------------------------=== + Total Execution Time: 0.0078 seconds + + ---User Time--- ---Wall Time--- --- Name --- + 0.0177 ( 88.5%) 0.0057 ( 71.3%) 'func' Pipeline + 0.0044 ( 22.0%) 0.0015 ( 18.9%) CSE + 0.0029 ( 14.5%) 0.0012 ( 15.2%) (A) DominanceInfo + 0.0038 ( 18.9%) 0.0015 ( 18.7%) VerifierPass + 0.0089 ( 44.6%) 0.0025 ( 31.1%) Canonicalizer + 0.0006 ( 3.0%) 0.0002 ( 2.6%) VerifierPass + 0.0004 ( 2.2%) 0.0004 ( 5.4%) VerifierPass + 0.0013 ( 6.5%) 0.0013 ( 16.3%) LLVMLoweringPass + 0.0006 ( 2.8%) 0.0006 ( 7.0%) VerifierPass + 0.0200 (100.0%) 0.0081 (100.0%) Total +``` + +#### IR Printing + +When debugging it is often useful to dump the IR at various stages of a pass +pipeline. This is where the IR printing instrumentation comes into play. This +instrumentation allows for conditionally printing the IR before and after pass +execution by optionally filtering on the pass being executed. This +instrumentation can be added directly to the PassManager via the +`enableIRPrinting` method. `mlir-opt` provides a few useful flags for utilizing +this instrumentation: + +* `print-ir-before=(comma-separated-pass-list)` + * Print the IR before each of the passes provided within the pass list. +* `print-ir-before-all` + * Print the IR before every pass in the pipeline. 
+ +```shell +$ mlir-opt foo.mlir -pass-pipeline='func(cse)' -print-ir-before=cse + +*** IR Dump Before CSE *** +func @simple_constant() -> (i32, i32) { + %c1_i32 = constant 1 : i32 + %c1_i32_0 = constant 1 : i32 + return %c1_i32, %c1_i32_0 : i32, i32 +} +``` + +* `print-ir-after=(comma-separated-pass-list)` + * Print the IR after each of the passes provided within the pass list. +* `print-ir-after-all` + * Print the IR after every pass in the pipeline. + +```shell +$ mlir-opt foo.mlir -pass-pipeline='func(cse)' -print-ir-after=cse + +*** IR Dump After CSE *** +func @simple_constant() -> (i32, i32) { + %c1_i32 = constant 1 : i32 + return %c1_i32, %c1_i32 : i32, i32 +} +``` + +* `print-ir-after-change` + * Only print the IR after a pass if the pass mutated the IR. This helps to + reduce the number of IR dumps for "uninteresting" passes. + * Note: Changes are detected by comparing a hash of the operation before + and after the pass. This adds additional run-time to compute the hash of + the IR, and in some rare cases may result in false-positives depending + on the collision rate of the hash algorithm used. + * Note: This option should be used in unison with one of the other + 'print-ir-after' options above, as this option alone does not enable + printing. + +```shell +$ mlir-opt foo.mlir -pass-pipeline='func(cse,cse)' -print-ir-after=cse -print-ir-after-change + +*** IR Dump After CSE *** +func @simple_constant() -> (i32, i32) { + %c1_i32 = constant 1 : i32 + return %c1_i32, %c1_i32 : i32, i32 +} +``` + +* `print-ir-module-scope` + * Always print the top-level module operation, regardless of pass type or + operation nesting level. 
+ * Note: Printing at module scope should only be used when multi-threading
+ is disabled (`-disable-pass-threading`)
+
+```shell
+$ mlir-opt foo.mlir -disable-pass-threading -pass-pipeline='func(cse)' -print-ir-after=cse -print-ir-module-scope
+
+*** IR Dump After CSE *** ('func' operation: @bar)
+func @bar(%arg0: f32, %arg1: f32) -> f32 {
+ ...
+}
+
+func @simple_constant() -> (i32, i32) {
+ %c1_i32 = constant 1 : i32
+ %c1_i32_0 = constant 1 : i32
+ return %c1_i32, %c1_i32_0 : i32, i32
+}
+
+*** IR Dump After CSE *** ('func' operation: @simple_constant)
+func @bar(%arg0: f32, %arg1: f32) -> f32 {
+ ...
+}
+
+func @simple_constant() -> (i32, i32) {
+ %c1_i32 = constant 1 : i32
+ return %c1_i32, %c1_i32 : i32, i32
+}
+```
+
+## Crash and Failure Reproduction
+
+The [pass manager](#pass-manager) in MLIR contains a builtin mechanism to
+generate reproducibles in the event of a crash, or a
+[pass failure](#pass-failure). This functionality can be enabled via
+`PassManager::enableCrashReproducerGeneration` or via the command line flag
+`pass-pipeline-crash-reproducer`. In either case, an argument is provided that
+corresponds to the output `.mlir` file name that the reproducible should be
+written to. The reproducible contains the configuration of the pass manager that
+was executing, as well as the initial IR before any passes were run. A potential
+reproducible may have the form:
+
+```mlir
+// configuration: -pass-pipeline='func(cse, canonicalize), inline'
+// note: verifyPasses=false
+
+module {
+ func @foo() {
+ ... 
+ } +} +``` diff --git a/mlir/docs/includes/img/index-map.svg b/mlir/docs/includes/img/index-map.svg new file mode 100644 index 0000000000000000000000000000000000000000..6004c2da362d1ec39b28cce73db3e937edc89a18 --- /dev/null +++ b/mlir/docs/includes/img/index-map.svg @@ -0,0 +1,380 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mlir/docs/includes/img/view-operation.svg b/mlir/docs/includes/img/view-operation.svg new file mode 100644 index 0000000000000000000000000000000000000000..f4d622ee263ce6db50358d42a34ab0177ea133e7 --- /dev/null +++ b/mlir/docs/includes/img/view-operation.svg @@ -0,0 +1,580 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/examples/toy/CMakeLists.txt b/mlir/examples/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..56002b1ad2e27aee3ca26a909e25b238599ae2d6 --- /dev/null +++ b/mlir/examples/toy/CMakeLists.txt @@ -0,0 +1,15 @@ +add_custom_target(Toy) +set_target_properties(Toy PROPERTIES FOLDER Examples) + +macro(add_toy_chapter name) + add_dependencies(Toy ${name}) + add_llvm_example(${name} ${ARGN}) +endmacro(add_toy_chapter name) + +add_subdirectory(Ch1) +add_subdirectory(Ch2) +add_subdirectory(Ch3) +add_subdirectory(Ch4) +add_subdirectory(Ch5) +add_subdirectory(Ch6) +add_subdirectory(Ch7) diff --git a/mlir/examples/toy/Ch1/CMakeLists.txt b/mlir/examples/toy/Ch1/CMakeLists.txt new 
file mode 100644 index 0000000000000000000000000000000000000000..f4e85556130161b5eaf59f6b353b608eff9f7eb9 --- /dev/null +++ b/mlir/examples/toy/Ch1/CMakeLists.txt @@ -0,0 +1,12 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +add_toy_chapter(toyc-ch1 + toyc.cpp + parser/AST.cpp + ) +include_directories(include/) +target_link_libraries(toyc-ch1 + PRIVATE + MLIRSupport) diff --git a/mlir/examples/toy/Ch1/include/toy/AST.h b/mlir/examples/toy/Ch1/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e --- /dev/null +++ b/mlir/examples/toy/Ch1/include/toy/AST.h @@ -0,0 +1,242 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. 
+class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. 
+class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch1/include/toy/Lexer.h b/mlir/examples/toy/Ch1/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..a77a91bb5645104b5474c680aca5368e18f130a0 --- /dev/null +++ b/mlir/examples/toy/Ch1/include/toy/Lexer.h @@ -0,0 +1,232 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. 
+enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purposes. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purposes (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. 
+ int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. + lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. 
+ do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. + Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. 
+ llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch1/include/toy/Parser.h b/mlir/examples/toy/Ch1/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71 --- /dev/null +++ b/mlir/examples/toy/Ch1/include/toy/Parser.h @@ -0,0 +1,485 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PARSER_H +#define MLIR_TUTORIAL_TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. 
+ Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. + std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. 
+ } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. 
+ auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. + lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current 
binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. + auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// 
initializer. + /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch1/parser/AST.cpp b/mlir/examples/toy/Ch1/parser/AST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415 --- /dev/null +++ b/mlir/examples/toy/Ch1/parser/AST.cpp @@ -0,0 +1,234 @@ +//===- AST.cpp - Helper for printing out the Toy AST ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST dump for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "mlir/ADT/TypeSwitch.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. +class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch 
to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. +void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. 
+void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. +void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. 
+void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. +void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch1/toyc.cpp b/mlir/examples/toy/Ch1/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48863fa931cd09d7216e262d55006ae341233775 --- /dev/null +++ b/mlir/examples/toy/Ch1/toyc.cpp @@ -0,0 +1,66 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. 
+// +//===----------------------------------------------------------------------===// + +#include "toy/Parser.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); +namespace { +enum Action { None, DumpAST }; +} + +static cl::opt + emitAction("emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump"))); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int main(int argc, char **argv) { + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + switch (emitAction) { + case Action::DumpAST: + dump(*moduleAST); + return 0; + default: + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + } + + return 0; +} diff --git a/mlir/examples/toy/Ch2/CMakeLists.txt b/mlir/examples/toy/Ch2/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7154902017eea2e262b79fc76171c0d6e1f597bd --- /dev/null +++ b/mlir/examples/toy/Ch2/CMakeLists.txt @@ -0,0 +1,21 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Support + ) + +add_toy_chapter(toyc-ch2 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + ) 
+include_directories(include/) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +add_dependencies(toyc-ch2 ToyCh2OpsIncGen) +target_link_libraries(toyc-ch2 + PRIVATE + MLIRAnalysis + MLIRIR + MLIRParser + MLIRTransforms) diff --git a/mlir/examples/toy/Ch2/include/CMakeLists.txt b/mlir/examples/toy/Ch2/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/toy/Ch2/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/examples/toy/Ch2/include/toy/AST.h b/mlir/examples/toy/Ch2/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/AST.h @@ -0,0 +1,242 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. 
+class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. 
+class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch2/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch2/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c08f78b0e8c8a93390fb46c401499687fbc232a0 --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +add_public_tablegen_target(ToyCh2OpsIncGen) diff --git a/mlir/examples/toy/Ch2/include/toy/Dialect.h b/mlir/examples/toy/Ch2/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..385d6ddb95ac4f50f1d8e64c2a1306114affd258 --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/Dialect.h @@ -0,0 +1,44 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. 
+// See g3doc/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" + +namespace mlir { +namespace toy { + +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods. +class ToyDialect : public mlir::Dialect { +public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities for casting between dialects. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. +#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch2/include/toy/Lexer.h b/mlir/examples/toy/Ch2/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..6eff64ee5f09634041f76cbae11c18f8ca46d07c --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/Lexer.h @@ -0,0 +1,232 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. 
+ void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. 
+ lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. 
+ Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch2/include/toy/MLIRGen.h b/mlir/examples/toy/Ch2/include/toy/MLIRGen.h new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8 --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/MLIRGen.h @@ -0,0 +1,32 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_ +#define MLIR_TUTORIAL_TOY_MLIRGEN_H_ + +#include + +namespace mlir { +class MLIRContext; +class OwningModuleRef; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST); +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_ diff --git a/mlir/examples/toy/Ch2/include/toy/Ops.td b/mlir/examples/toy/Ch2/include/toy/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..aa7e94fcae77db5d9c3f18efbc02745ced5e4aa1 --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/Ops.td @@ -0,0 +1,220 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/IR/OpBase.td" + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. +def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. 
+class Toy_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'NoSideEffect' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F64ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. these are used when + // using `builder.create(...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<"Builder *builder, OperationState &state, " + "DenseElementsAttr value", [{ + build(builder, state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<"Builder *builder, OperationState &state, double value"> + ]; + + // Invoke a static verify method to verify this constant operation. 
+ let verifier = [{ return ::verify(*this); }]; +} + +def AddOp : Toy_Op<"add"> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def GenericCallOp : Toy_Op<"generic_call"> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. + }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$inputs); + + // The generic call operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the generic call operation. + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "StringRef callee, ArrayRef arguments"> + ]; +} + +def MulOp : Toy_Op<"mul"> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. 
+ }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + let arguments = (ins F64Tensor:$input); +} + +def ReshapeOp : Toy_Op<"reshape"> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. For example: + + ```mlir + %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + ``` + }]; + + let arguments = (ins F64Tensor:$input); + + // We expect that the reshape operation returns a statically shaped tensor. + let results = (outs StaticShapeTensorOf<[F64]>); +} + +def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + func @foo() -> tensor<2xf64> { + ... + toy.return %0 : tensor<2xf64> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // Allow building a ReturnOp with no return operand. + let builders = [OpBuilder< + "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] + >]; + + // Provide extra utility definitions on the c++ operation class definition. 
+ let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Invoke a static verify method to verify this return operation. + let verifier = [{ return ::verify(*this); }]; +} + +def TransposeOp : Toy_Op<"transpose"> { + let summary = "transpose operation"; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor); + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input"> + ]; + + // Invoke a static verify method to verify this transpose operation. + let verifier = [{ return ::verify(*this); }]; +} + +#endif // TOY_OPS diff --git a/mlir/examples/toy/Ch2/include/toy/Parser.h b/mlir/examples/toy/Ch2/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71 --- /dev/null +++ b/mlir/examples/toy/Ch2/include/toy/Parser.h @@ -0,0 +1,485 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PARSER_H +#define MLIR_TUTORIAL_TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. 
It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. + std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. 
+ std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. 
+ for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. 
+ lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. 
+ auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+  /// decl ::= var identifier [ type ] = expr
+  std::unique_ptr parseDeclaration() {
+    if (lexer.getCurToken() != tok_var)
+      return parseError("var", "to begin declaration");
+    auto loc = lexer.getLastLocation();
+    lexer.getNextToken(); // eat var
+
+    if (lexer.getCurToken() != tok_identifier)
+      return parseError("identifier",
+                        "after 'var' declaration");
+    std::string id = lexer.getId();
+    lexer.getNextToken(); // eat id
+
+    std::unique_ptr type; // Type is optional, it can be inferred
+    if (lexer.getCurToken() == '<') {
+      type = parseType();
+      if (!type)
+        return nullptr;
+    }
+
+    if (!type)
+      type = std::make_unique();
+    lexer.consume(Token('='));
+    auto expr = parseExpression();
+    return std::make_unique(std::move(loc), std::move(id),
+                            std::move(*type), std::move(expr));
+  }
+
+  /// Parse a block: a list of expression separated by semicolons and wrapped in
+  /// curly braces.
+  ///
+  /// block ::= { expression_list }
+  /// expression_list ::= block_expr ; expression_list
+  /// block_expr ::= decl | "return" | expr
+  std::unique_ptr parseBlock() {
+    if (lexer.getCurToken() != '{')
+      return parseError("{", "to begin block");
+    lexer.consume(Token('{'));
+
+    auto exprList = std::make_unique();
+
+    // Ignore empty expressions: swallow sequences of semicolons.
+    while (lexer.getCurToken() == ';')
+      lexer.consume(Token(';'));
+
+    while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) {
+      if (lexer.getCurToken() == tok_var) {
+        // Variable declaration
+        auto varDecl = parseDeclaration();
+        if (!varDecl)
+          return nullptr;
+        exprList->push_back(std::move(varDecl));
+      } else if (lexer.getCurToken() == tok_return) {
+        // Return statement
+        auto ret = parseReturn();
+        if (!ret)
+          return nullptr;
+        exprList->push_back(std::move(ret));
+      } else {
+        // General expression
+        auto expr = parseExpression();
+        if (!expr)
+          return nullptr;
+        exprList->push_back(std::move(expr));
+      }
+      // Ensure that elements are separated by a semicolon.
+      if (lexer.getCurToken() != ';')
+        return parseError(";", "after expression");
+
+      // Ignore empty expressions: swallow sequences of semicolons.
+      while (lexer.getCurToken() == ';')
+        lexer.consume(Token(';'));
+    }
+
+    if (lexer.getCurToken() != '}')
+      return parseError("}", "to close block");
+
+    lexer.consume(Token('}'));
+    return exprList;
+  }
+
+  /// prototype ::= def id '(' decl_list ')'
+  /// decl_list ::= identifier | identifier, decl_list
+  std::unique_ptr parsePrototype() {
+    auto loc = lexer.getLastLocation();
+    lexer.consume(tok_def);
+    if (lexer.getCurToken() != tok_identifier)
+      return parseError("function name", "in prototype");
+
+    std::string fnName = lexer.getId();
+    lexer.consume(tok_identifier);
+
+    if (lexer.getCurToken() != '(')
+      return parseError("(", "in prototype");
+    lexer.consume(Token('('));
+
+    std::vector> args;
+    if (lexer.getCurToken() != ')') {
+      do {
+        std::string name = lexer.getId();
+        auto loc = lexer.getLastLocation();
+        lexer.consume(tok_identifier);
+        auto decl = std::make_unique(std::move(loc), name);
+        args.push_back(std::move(decl));
+        if (lexer.getCurToken() != ',')
+          break;
+        lexer.consume(Token(','));
+        if (lexer.getCurToken() != tok_identifier)
+          return parseError(
+              "identifier", "after ',' in function parameter list");
+      } while (true);
+    }
+    if (lexer.getCurToken() != ')')
+      return parseError(")", "to end function prototype");
+
+    // success.
+    lexer.consume(Token(')'));
+    return std::make_unique(std::move(loc), fnName,
+                            std::move(args));
+  }
+
+  /// Parse a function definition, we expect a prototype initiated with the
+  /// `def` keyword, followed by a block containing a list of expressions.
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch2/mlir/Dialect.cpp b/mlir/examples/toy/Ch2/mlir/Dialect.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b4d669d18eabd72acb11eea6b4dbc1c3dab4ecd --- /dev/null +++ b/mlir/examples/toy/Ch2/mlir/Dialect.cpp @@ -0,0 +1,180 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/StandardTypes.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. 
+static mlir::LogicalResult verify(ConstantOp op) { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + op.getResult()->getType().dyn_cast(); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = op.value().getType().cast(); + if (attrType.getRank() != resultType.getRank()) { + return op.emitOpError( + "return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. + for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return op.emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp + +void AddOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +//===----------------------------------------------------------------------===// +// GenericCallOp + +void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. 
+ state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+ if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9c960c79f47254d31e72037431a0f9d3a614276 --- /dev/null +++ b/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp @@ -0,0 +1,452 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. 
+ theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &F : moduleAST) { + auto func = mlirGen(F); + if (!func) + return nullptr; + theModule.push_back(func); + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. + mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + mlir::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. 
+ // Arguments type are uniformly unranked tensors. + llvm::SmallVector arg_types(proto.getArgs().size(), + getType(VarType{})); + auto func_type = builder.getFunctionType(arg_types, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(name_value)->getName(), + std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. 
+ // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + getType(VarType{}))); + } + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. 
The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. + /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. 
Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + return builder.create(location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). + mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. 
+ mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. 
+ if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch2/parser/AST.cpp b/mlir/examples/toy/Ch2/parser/AST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415 --- /dev/null +++ b/mlir/examples/toy/Ch2/parser/AST.cpp @@ -0,0 +1,234 @@ +//===- AST.cpp - Helper for printing out the Toy AST ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST dump for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "mlir/ADT/TypeSwitch.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. +class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // 
No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. +void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. +void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). 
+void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. +void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. +void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. 
+void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. +void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch2/toyc.cpp b/mlir/examples/toy/Ch2/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e3db97b4aee74bc60fdb9b022eb8a183954ec44 --- /dev/null +++ b/mlir/examples/toy/Ch2/toyc.cpp @@ -0,0 +1,137 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. 
+// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the input file as an MLIR file"))); + +namespace { +enum Action { None, DumpAST, DumpMLIR }; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump"))); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int dumpMLIR() { + // Register our Dialect with MLIR. + mlir::registerDialect(); + + mlir::MLIRContext context; + + // Handle '.toy' input to the compiler. 
+ if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + mlir::OwningModuleRef module = mlirGen(context, *moduleAST); + if (!module) + return 1; + + module->dump(); + return 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. + llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + mlir::OwningModuleRef module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + + module->dump(); + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int main(int argc, char **argv) { + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + switch (emitAction) { + case Action::DumpAST: + return dumpAST(); + case Action::DumpMLIR: + return dumpMLIR(); + default: + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + } + + return 0; +} diff --git a/mlir/examples/toy/Ch3/CMakeLists.txt b/mlir/examples/toy/Ch3/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..823edfd343a3e06b05f7cd8f9bed930b54448dbb --- /dev/null +++ b/mlir/examples/toy/Ch3/CMakeLists.txt @@ -0,0 +1,31 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Support + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters "-I${CMAKE_CURRENT_SOURCE_DIR}/include") 
+add_public_tablegen_target(ToyCh3CombineIncGen) + +add_toy_chapter(toyc-ch3 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/ToyCombine.cpp + ) + +add_dependencies(toyc-ch3 ToyCh3OpsIncGen) +add_dependencies(toyc-ch3 ToyCh3CombineIncGen) +include_directories(include/) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +target_link_libraries(toyc-ch3 + PRIVATE + MLIRAnalysis + MLIRIR + MLIRParser + MLIRPass + MLIRTransforms) + diff --git a/mlir/examples/toy/Ch3/include/CMakeLists.txt b/mlir/examples/toy/Ch3/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/toy/Ch3/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/examples/toy/Ch3/include/toy/AST.h b/mlir/examples/toy/Ch3/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e --- /dev/null +++ b/mlir/examples/toy/Ch3/include/toy/AST.h @@ -0,0 +1,242 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". 
+class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. +class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch3/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch3/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e76780c1f79f3901338b5f2b8e57c13265f7f387 --- /dev/null +++ b/mlir/examples/toy/Ch3/include/toy/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +add_public_tablegen_target(ToyCh3OpsIncGen) diff --git a/mlir/examples/toy/Ch3/include/toy/Dialect.h b/mlir/examples/toy/Ch3/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..385d6ddb95ac4f50f1d8e64c2a1306114affd258 --- /dev/null +++ b/mlir/examples/toy/Ch3/include/toy/Dialect.h @@ -0,0 +1,44 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. 
+// See g3doc/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" + +namespace mlir { +namespace toy { + +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods. +class ToyDialect : public mlir::Dialect { +public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities for casting between dialects. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. +#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch3/include/toy/Lexer.h b/mlir/examples/toy/Ch3/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..6eff64ee5f09634041f76cbae11c18f8ca46d07c --- /dev/null +++ b/mlir/examples/toy/Ch3/include/toy/Lexer.h @@ -0,0 +1,232 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. 
+ void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. 
+ lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. 
+  Token lastChar = Token(' ');
+
+  /// Keep track of the current line number in the input stream
+  int curLineNum = 0;
+
+  /// Keep track of the current column number in the input stream
+  int curCol = 0;
+
+  /// Buffer supplied by the derived class on calls to `readNextLine()`
+  llvm::StringRef curLineBuffer = "\n";
+};
+
+/// A lexer implementation operating on a buffer in memory.
+class LexerBuffer final : public Lexer {
+public:
+  LexerBuffer(const char *begin, const char *end, std::string filename)
+      : Lexer(std::move(filename)), current(begin), end(end) {}
+
+private:
+  /// Provide one line at a time to the Lexer, return an empty string when
+  /// reaching the end of the buffer.
+  llvm::StringRef readNextLine() override {
+    auto *begin = current;
+    while (current <= end && *current && *current != '\n')
+      ++current;
+    if (current <= end && *current)
+      ++current;
+    llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
+    return result;
+  }
+  const char *current, *end;
+};
+} // namespace toy
+
+#endif // MLIR_TUTORIAL_TOY_LEXER_H_
diff --git a/mlir/examples/toy/Ch3/include/toy/MLIRGen.h b/mlir/examples/toy/Ch3/include/toy/MLIRGen.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8
--- /dev/null
+++ b/mlir/examples/toy/Ch3/include/toy/MLIRGen.h
@@ -0,0 +1,32 @@
+//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a simple interface to perform IR generation targeting MLIR
+// from a Module AST for the Toy language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_
+#define MLIR_TUTORIAL_TOY_MLIRGEN_H_
+
+#include <memory>
+
+namespace mlir {
+class MLIRContext;
+class OwningModuleRef;
+} // namespace mlir
+
+namespace toy {
+class ModuleAST;
+
+/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module
+/// or nullptr on failure.
+mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST);
+} // namespace toy
+
+#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_
diff --git a/mlir/examples/toy/Ch3/include/toy/Ops.td b/mlir/examples/toy/Ch3/include/toy/Ops.td
new file mode 100644
index 0000000000000000000000000000000000000000..80717119b2fe4deb84528863f9d69f7bc0502f14
--- /dev/null
+++ b/mlir/examples/toy/Ch3/include/toy/Ops.td
@@ -0,0 +1,226 @@
+//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the operations of the Toy dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TOY_OPS
+#define TOY_OPS
+
+include "mlir/IR/OpBase.td"
+
+// Provide a definition of the 'toy' dialect in the ODS framework so that we
+// can define our operations.
+def Toy_Dialect : Dialect {
+  let name = "toy";
+  let cppNamespace = "toy";
+}
+
+// Base class for toy dialect operations. This operation inherits from the base
+// `Op` class in OpBase.td, and provides:
+//   * The parent dialect of the operation.
+//   * The mnemonic for the operation, or the name without the dialect prefix.
+//   * A list of traits for the operation.
+class Toy_Op<string mnemonic, list<OpTrait> traits = []> :
+    Op<Toy_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Toy Operations
+//===----------------------------------------------------------------------===//
+
+// We define a toy operation by inheriting from our base 'Toy_Op' class above.
+// Here we provide the mnemonic and a list of traits for the operation. The
+// constant operation is marked as 'NoSideEffect' as it is a pure operation
+// and may be removed if dead.
+def ConstantOp : Toy_Op<"constant", [NoSideEffect]> {
+  // Provide a summary and description for this operation. This can be used to
+  // auto-generate documentation of the operations within our dialect.
+  let summary = "constant";
+  let description = [{
+    Constant operation turns a literal into an SSA value. The data is attached
+    to the operation as an attribute. For example:
+
+    ```mlir
+      %0 = "toy.constant"()
+         { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> }
+        : () -> tensor<2x3xf64>
+    ```
+  }];
+
+  // The constant operation takes an attribute as the only input.
+  let arguments = (ins F64ElementsAttr:$value);
+
+  // The constant operation returns a single value of TensorType.
+  let results = (outs F64Tensor);
+
+  // Add custom build methods for the constant operation. These method populates
+  // the `state` that MLIR uses to create operations, i.e. these are used when
+  // using `builder.create<ConstantOp>(...)`.
+  let builders = [
+    // Build a constant with a given constant tensor value.
+    OpBuilder<"Builder *builder, OperationState &state, "
+              "DenseElementsAttr value", [{
+      build(builder, state, value.getType(), value);
+    }]>,
+
+    // Build a constant with a given constant floating-point value.
+    OpBuilder<"Builder *builder, OperationState &state, double value">
+  ];
+
+  // Invoke a static verify method to verify this constant operation.
+ let verifier = [{ return ::verify(*this); }]; +} + +def AddOp : Toy_Op<"add", [NoSideEffect]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def GenericCallOp : Toy_Op<"generic_call"> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. + }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$inputs); + + // The generic call operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the generic call operation. + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "StringRef callee, ArrayRef arguments"> + ]; +} + +def MulOp : Toy_Op<"mul", [NoSideEffect]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. 
+ }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + let arguments = (ins F64Tensor:$input); +} + +def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. For example: + + ```mlir + %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + ``` + }]; + + let arguments = (ins F64Tensor:$input); + + // Enabled registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. + let results = (outs StaticShapeTensorOf<[F64]>); +} + +def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + func @foo() -> tensor<2xf64> { + ... + toy.return %0 : tensor<2xf64> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // Allow building a ReturnOp with no return operand. 
+ let builders = [OpBuilder< + "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] + >]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Invoke a static verify method to verify this return operation. + let verifier = [{ return ::verify(*this); }]; +} + +def TransposeOp : Toy_Op<"transpose", [NoSideEffect]> { + let summary = "transpose operation"; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor); + + // Enabled registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input"> + ]; + + // Invoke a static verify method to verify this transpose operation. + let verifier = [{ return ::verify(*this); }]; +} + +#endif // TOY_OPS diff --git a/mlir/examples/toy/Ch3/include/toy/Parser.h b/mlir/examples/toy/Ch3/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71 --- /dev/null +++ b/mlir/examples/toy/Ch3/include/toy/Parser.h @@ -0,0 +1,485 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TUTORIAL_TOY_PARSER_H
+#define MLIR_TUTORIAL_TOY_PARSER_H
+
+#include "toy/AST.h"
+#include "toy/Lexer.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+namespace toy {
+
+/// This is a simple recursive parser for the Toy language. It produces a well
+/// formed AST from a stream of Token supplied by the Lexer. No semantic checks
+/// or symbol resolution is performed. For example, variables are referenced by
+/// string and the code could reference an undeclared variable and the parsing
+/// succeeds.
+class Parser {
+public:
+  /// Create a Parser for the supplied lexer.
+  Parser(Lexer &lexer) : lexer(lexer) {}
+
+  /// Parse a full Module. A module is a list of function definitions.
+  std::unique_ptr<ModuleAST> parseModule() {
+    lexer.getNextToken(); // prime the lexer
+
+    // Parse functions one at a time and accumulate in this vector.
+    std::vector<FunctionAST> functions;
+    while (auto f = parseDefinition()) {
+      functions.push_back(std::move(*f));
+      if (lexer.getCurToken() == tok_eof)
+        break;
+    }
+    // If we didn't reach EOF, there was an error during parsing
+    if (lexer.getCurToken() != tok_eof)
+      return parseError<ModuleAST>("nothing", "at end of module");
+
+    return std::make_unique<ModuleAST>(std::move(functions));
+  }
+
+private:
+  Lexer &lexer;
+
+  /// Parse a return statement.
+  /// return :== return ; | return expr ;
+  std::unique_ptr<ReturnExprAST> parseReturn() {
+    auto loc = lexer.getLastLocation();
+    lexer.consume(tok_return);
+
+    // return takes an optional argument
+    llvm::Optional<std::unique_ptr<ExprAST>> expr;
+    if (lexer.getCurToken() != ';') {
+      expr = parseExpression();
+      if (!expr)
+        return nullptr;
+    }
+    return std::make_unique<ReturnExprAST>(std::move(loc), std::move(expr));
+  }
+
+  /// Parse a literal number.
+ /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. 
+ if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. 
+ lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. 
+ auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+  ///
+  /// definition ::= prototype block
+  std::unique_ptr<FunctionAST> parseDefinition() {
+    auto proto = parsePrototype();
+    if (!proto)
+      return nullptr;
+
+    if (auto block = parseBlock())
+      return std::make_unique<FunctionAST>(std::move(proto), std::move(block));
+    return nullptr;
+  }
+
+  /// Get the precedence of the pending binary operator token.
+  int getTokPrecedence() {
+    if (!isascii(lexer.getCurToken()))
+      return -1;
+
+    // 1 is lowest precedence.
+    switch (static_cast<char>(lexer.getCurToken())) {
+    case '-':
+      return 20;
+    case '+':
+      return 20;
+    case '*':
+      return 40;
+    default:
+      return -1;
+    }
+  }
+
+  /// Helper function to signal errors while parsing, it takes an argument
+  /// indicating the expected token and another argument giving more context.
+  /// Location is retrieved from the lexer to enrich the error message.
+  template <typename R, typename T, typename U = const char *>
+  std::unique_ptr<R> parseError(T &&expected, U &&context = "") {
+    auto curToken = lexer.getCurToken();
+    llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", "
+                 << lexer.getLastLocation().col << "): expected '" << expected
+                 << "' " << context << " but has Token " << curToken;
+    if (isprint(curToken))
+      llvm::errs() << " '" << (char)curToken << "'";
+    llvm::errs() << "\n";
+    return nullptr;
+  }
+};
+
+} // namespace toy
+
+#endif // MLIR_TUTORIAL_TOY_PARSER_H
diff --git a/mlir/examples/toy/Ch3/mlir/Dialect.cpp b/mlir/examples/toy/Ch3/mlir/Dialect.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b4d669d18eabd72acb11eea6b4dbc1c3dab4ecd
--- /dev/null
+++ b/mlir/examples/toy/Ch3/mlir/Dialect.cpp
@@ -0,0 +1,180 @@
+//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/StandardTypes.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. 
+static mlir::LogicalResult verify(ConstantOp op) {
+  // If the return type of the constant is not an unranked tensor, the shape
+  // must match the shape of the attribute holding the data.
+  auto resultType =
+      op.getResult()->getType().dyn_cast<mlir::RankedTensorType>();
+  if (!resultType)
+    return success();
+
+  // Check that the rank of the attribute type matches the rank of the constant
+  // result type.
+  auto attrType = op.value().getType().cast<mlir::TensorType>();
+  if (attrType.getRank() != resultType.getRank()) {
+    return op.emitOpError(
+               "return type must match the one of the attached value "
+               "attribute: ")
+           << attrType.getRank() << " != " << resultType.getRank();
+  }
+
+  // Check that each of the dimensions match between the two types.
+  for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) {
+    if (attrType.getShape()[dim] != resultType.getShape()[dim]) {
+      return op.emitOpError(
+                 "return type shape mismatches its attribute at dimension ")
+             << dim << ": " << attrType.getShape()[dim]
+             << " != " << resultType.getShape()[dim];
+    }
+  }
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// AddOp
+
+void AddOp::build(mlir::Builder *builder, mlir::OperationState &state,
+                  mlir::Value lhs, mlir::Value rhs) {
+  state.addTypes(UnrankedTensorType::get(builder->getF64Type()));
+  state.addOperands({lhs, rhs});
+}
+
+//===----------------------------------------------------------------------===//
+// GenericCallOp
+
+void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state,
+                          StringRef callee, ArrayRef<mlir::Value> arguments) {
+  // Generic call always returns an unranked Tensor initially.
+ state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+ if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9c960c79f47254d31e72037431a0f9d3a614276 --- /dev/null +++ b/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp @@ -0,0 +1,452 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. 
+ theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &F : moduleAST) { + auto func = mlirGen(F); + if (!func) + return nullptr; + theModule.push_back(func); + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. + mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + mlir::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. 
+ // Arguments type are uniformly unranked tensors. + llvm::SmallVector arg_types(proto.getArgs().size(), + getType(VarType{})); + auto func_type = builder.getFunctionType(arg_types, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(name_value)->getName(), + std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. 
+ // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + getType(VarType{}))); + } + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. 
The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. + /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. 
Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + return builder.create(location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). + mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. 
+ mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. 
+ if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e320540217935bab1df63e1afb0878c3fb03a000 --- /dev/null +++ b/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp @@ -0,0 +1,69 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "toy/Dialect.h" +#include +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // end anonymous namespace + +/// This is an example of a c++ rewrite pattern for the TransposeOp. It +/// optimizes the following scenario: transpose(transpose(x)) -> transpose(x) +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + mlir::PatternMatchResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = + llvm::dyn_cast_or_null(transposeInput->getDefiningOp()); + + // If the input is defined by another Transpose, bingo! + if (!transposeInputOp) + return matchFailure(); + + // Use the rewriter to perform the replacement. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp}); + return matchSuccess(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. 
+void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} diff --git a/mlir/examples/toy/Ch3/mlir/ToyCombine.td b/mlir/examples/toy/Ch3/mlir/ToyCombine.td new file mode 100644 index 0000000000000000000000000000000000000000..e6e33e84d7e8f3e13aea9840f3690029de025d94 --- /dev/null +++ b/mlir/examples/toy/Ch3/mlir/ToyCombine.td @@ -0,0 +1,62 @@ +//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines language-specific pattern match optimizations for Toy using +// Declarative Rewrite Rules (DRR) specified using TableGen records. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TOY_COMBINE
+#define TOY_COMBINE
+
+include "toy/Ops.td"
+
+/// Note: The DRR definition used for defining patterns is shown below:
+///
+/// class Pattern<
+///    dag sourcePattern, list<dag> resultPatterns,
+///    list<dag> additionalConstraints = [],
+///    dag benefitsAdded = (addBenefit 0)
+/// >;
+
+//===----------------------------------------------------------------------===//
+// Basic Pattern-Match and Rewrite
+//===----------------------------------------------------------------------===//
+
+// Reshape(Reshape(x)) = Reshape(x)
+def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)),
+                                   (ReshapeOp $arg)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern-Match and Rewrite using Native Code Call
+//===----------------------------------------------------------------------===//
+
+// Native Code Calls may be used for more complex transformations using inline
+// C++ and C++ helper functions.
+
+// Reshape(Constant(x)) = x'
+def ReshapeConstant :
+  NativeCodeCall<"$0.reshape(($1->getType()).cast<ShapedType>())">;
+def FoldConstantReshapeOptPattern : Pat<
+  (ReshapeOp:$res (ConstantOp $arg)),
+  (ConstantOp (ReshapeConstant $arg, $res))>;
+
+//===----------------------------------------------------------------------===//
+// Pattern-Match and Rewrite with Constraints
+//===----------------------------------------------------------------------===//
+
+// DRR allows for constraint checking when the transformation is conditional
+// on operand properties.
+
+// Reshape(x) = x, where input and output shapes are identical
+def TypesAreIdentical : Constraint<CPred<"$0->getType() == $1->getType()">>;
+def RedundantReshapeOptPattern : Pat<
+  (ReshapeOp:$res $arg), (replaceWithValue $arg),
+  [(TypesAreIdentical $res, $arg)]>;
+
+#endif // TOY_COMBINE
diff --git a/mlir/examples/toy/Ch3/parser/AST.cpp b/mlir/examples/toy/Ch3/parser/AST.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415
--- /dev/null
+++ b/mlir/examples/toy/Ch3/parser/AST.cpp
@@ -0,0 +1,234 @@
+//===- AST.cpp - Helper for printing out the Toy AST ----------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AST dump for the Toy language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "toy/AST.h"
+
+#include "mlir/ADT/TypeSwitch.h"
+#include "mlir/Support/STLExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace toy;
+
+namespace {
+
+// RAII helper to manage increasing/decreasing the indentation as we traverse
+// the AST
+struct Indent {
+  Indent(int &level) : level(level) { ++level; }
+  ~Indent() { --level; }
+  int &level;
+};
+
+/// Helper class that implement the AST tree traversal and print the nodes along
+/// the way. The only data member is the current indentation level.
+class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. 
+void ASTDumper::dump(VarDeclExprAST *varDecl) {
+  INDENT();
+  llvm::errs() << "VarDecl " << varDecl->getName();
+  dump(varDecl->getType());
+  llvm::errs() << " " << loc(varDecl) << "\n";
+  dump(varDecl->getInitVal());
+}
+
+/// A "block", or a list of expression
+void ASTDumper::dump(ExprASTList *exprList) {
+  INDENT();
+  llvm::errs() << "Block {\n";
+  for (auto &expr : *exprList)
+    dump(expr.get());
+  indent();
+  llvm::errs() << "} // Block\n";
+}
+
+/// A literal number, just print the value.
+void ASTDumper::dump(NumberExprAST *num) {
+  INDENT();
+  llvm::errs() << num->getValue() << " " << loc(num) << "\n";
+}
+
+/// Helper to print recursively a literal. This handles nested array like:
+///    [ [ 1, 2 ], [ 3, 4 ] ]
+/// We print out such array with the dimensions spelled out at every level:
+///    <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ]
+void printLitHelper(ExprAST *litOrNum) {
+  // Inside a literal expression we can have either a number or another literal
+  if (auto num = llvm::dyn_cast<NumberExprAST>(litOrNum)) {
+    llvm::errs() << num->getValue();
+    return;
+  }
+  auto *literal = llvm::cast<LiteralExprAST>(litOrNum);
+
+  // Print the dimension for this literal first
+  llvm::errs() << "<";
+  mlir::interleaveComma(literal->getDims(), llvm::errs());
+  llvm::errs() << ">";
+
+  // Now print the content, recursing on every element of the list
+  llvm::errs() << "[ ";
+  mlir::interleaveComma(literal->getValues(), llvm::errs(),
+                        [&](auto &elt) { printLitHelper(elt.get()); });
+  llvm::errs() << "]";
+}
+
+/// Print a literal, see the recursive helper above for the implementation.
+void ASTDumper::dump(LiteralExprAST *node) {
+  INDENT();
+  llvm::errs() << "Literal: ";
+  printLitHelper(node);
+  llvm::errs() << " " << loc(node) << "\n";
+}
+
+/// Print a variable reference (just a name).
+void ASTDumper::dump(VariableExprAST *node) {
+  INDENT();
+  llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n";
+}
+
+/// Return statement print the return and its (optional) argument.
+void ASTDumper::dump(ReturnExprAST *node) {
+  INDENT();
+  llvm::errs() << "Return\n";
+  if (node->getExpr().hasValue())
+    return dump(*node->getExpr());
+  {
+    INDENT();
+    llvm::errs() << "(void)\n";
+  }
+}
+
+/// Print a binary operation, first the operator, then recurse into LHS and RHS.
+void ASTDumper::dump(BinaryExprAST *node) {
+  INDENT();
+  llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n";
+  dump(node->getLHS());
+  dump(node->getRHS());
+}
+
+/// Print a call expression, first the callee name and the list of args by
+/// recursing into each individual argument.
+void ASTDumper::dump(CallExprAST *node) {
+  INDENT();
+  llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n";
+  for (auto &arg : node->getArgs())
+    dump(arg.get());
+  indent();
+  llvm::errs() << "]\n";
+}
+
+/// Print a builtin print call, first the builtin name and then the argument.
+void ASTDumper::dump(PrintExprAST *node) {
+  INDENT();
+  llvm::errs() << "Print [ " << loc(node) << "\n";
+  dump(node->getArg());
+  indent();
+  llvm::errs() << "]\n";
+}
+
+/// Print type: only the shape is printed in between '<' and '>'
+void ASTDumper::dump(const VarType &type) {
+  llvm::errs() << "<";
+  mlir::interleaveComma(type.shape, llvm::errs());
+  llvm::errs() << ">";
+}
+
+/// Print a function prototype, first the function name, and then the list of
+/// parameters names.
+void ASTDumper::dump(PrototypeAST *node) {
+  INDENT();
+  llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n";
+  indent();
+  llvm::errs() << "Params: [";
+  mlir::interleaveComma(node->getArgs(), llvm::errs(),
+                        [](auto &arg) { llvm::errs() << arg->getName(); });
+  llvm::errs() << "]\n";
+}
+
+/// Print a function, first the prototype and then the body.
+void ASTDumper::dump(FunctionAST *node) {
+  INDENT();
+  llvm::errs() << "Function \n";
+  dump(node->getProto());
+  dump(node->getBody());
+}
+
+/// Print a module, actually loop over the functions and print them in sequence.
+void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch3/toyc.cpp b/mlir/examples/toy/Ch3/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8b6e94786bed91eb2a3c3dcfd963daa5efbfdb4 --- /dev/null +++ b/mlir/examples/toy/Ch3/toyc.cpp @@ -0,0 +1,157 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the 
input file as an MLIR file"))); + +namespace { +enum Action { None, DumpAST, DumpMLIR }; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump"))); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, + mlir::OwningModuleRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +int dumpMLIR() { + // Register our Dialect with MLIR. 
+ mlir::registerDialect(); + + mlir::MLIRContext context; + mlir::OwningModuleRef module; + llvm::SourceMgr sourceMgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); + if (int error = loadMLIR(sourceMgr, context, module)) + return error; + + if (enableOpt) { + mlir::PassManager pm(&context); + // Apply any generic pass manager command line options and run the pipeline. + applyPassManagerCLOptions(pm); + + // Add a run of the canonicalizer to optimize the mlir module. + pm.addNestedPass(mlir::createCanonicalizerPass()); + if (mlir::failed(pm.run(*module))) + return 4; + } + + module->dump(); + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int main(int argc, char **argv) { + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + switch (emitAction) { + case Action::DumpAST: + return dumpAST(); + case Action::DumpMLIR: + return dumpMLIR(); + default: + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + } + + return 0; +} diff --git a/mlir/examples/toy/Ch4/CMakeLists.txt b/mlir/examples/toy/Ch4/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d11e5abcf13037f608f67c0a047e1f30ccf9c57e --- /dev/null +++ b/mlir/examples/toy/Ch4/CMakeLists.txt @@ -0,0 +1,35 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Support + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters "-I${CMAKE_CURRENT_SOURCE_DIR}/include") +add_public_tablegen_target(ToyCh4CombineIncGen) + +add_toy_chapter(toyc-ch4 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/DeadFunctionEliminationPass.cpp + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp + ) + 
+add_dependencies(toyc-ch4 ToyCh4OpsIncGen) +add_dependencies(toyc-ch4 ToyCh4ShapeInferenceInterfaceIncGen) +add_dependencies(toyc-ch4 ToyCh4CombineIncGen) +add_dependencies(toyc-ch4 MLIRCallOpInterfacesIncGen) +include_directories(include/) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +target_link_libraries(toyc-ch4 + PRIVATE + MLIRAnalysis + MLIRIR + MLIRParser + MLIRPass + MLIRTransforms) + diff --git a/mlir/examples/toy/Ch4/include/CMakeLists.txt b/mlir/examples/toy/Ch4/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/toy/Ch4/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/examples/toy/Ch4/include/toy/AST.h b/mlir/examples/toy/Ch4/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/AST.h @@ -0,0 +1,242 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". 
+class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. +class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch4/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..798d0df1d8d685f0ffd97d70eac806794cfd2503 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +mlir_tablegen(Ops.cpp.inc -gen-op-defs "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +add_public_tablegen_target(ToyCh4OpsIncGen) + +set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCh4ShapeInferenceInterfaceIncGen) diff --git a/mlir/examples/toy/Ch4/include/toy/Dialect.h b/mlir/examples/toy/Ch4/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..5e8b91dcf4843762db80cde22ef96a0b22929840 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/Dialect.h @@ -0,0 +1,46 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. +// See g3doc/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/StandardTypes.h" +#include "toy/ShapeInferenceInterface.h" + +namespace mlir { +namespace toy { + +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods. +class ToyDialect : public mlir::Dialect { +public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities for casting between dialects. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. +#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/Lexer.h b/mlir/examples/toy/Ch4/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..6eff64ee5f09634041f76cbae11c18f8ca46d07c --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/Lexer.h @@ -0,0 +1,232 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. 
+ Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. 
+ lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. 
+ Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/MLIRGen.h b/mlir/examples/toy/Ch4/include/toy/MLIRGen.h new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/MLIRGen.h @@ -0,0 +1,32 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_ +#define MLIR_TUTORIAL_TOY_MLIRGEN_H_ + +#include + +namespace mlir { +class MLIRContext; +class OwningModuleRef; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST); +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/Ops.td b/mlir/examples/toy/Ch4/include/toy/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..dfb11cf23b9aa7dc514f4e8610e04f138b8ba35f --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/Ops.td @@ -0,0 +1,246 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Analysis/CallInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. +def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. 
+class Toy_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'NoSideEffect' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F64ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. these are used when + // using `builder.create(...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<"Builder *builder, OperationState &state, " + "DenseElementsAttr value", [{ + build(builder, state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<"Builder *builder, OperationState &state, double value"> + ]; + + // Invoke a static verify method to verify this constant operation. 
+ let verifier = [{ return ::verify(*this); }]; +} + +def AddOp : Toy_Op<"add", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def CastOp : Toy_Op<"cast", + [DeclareOpInterfaceMethods, NoSideEffect, + SameOperandsAndResultShape]> { + let summary = "shape cast operation"; + let description = [{ + The "cast" operation converts a tensor from one type to an equivalent type + without changing any data elements. The source and destination types + must both be tensor types with the same element type. If both are ranked + then the rank should be the same and static dimensions should match. The + operation is invalid if converting to a mismatching constant dimension. + }]; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor:$output); + + // Set the folder bit so that we can fold redundant cast operations. + let hasFolder = 1; +} + +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. 
+ }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$inputs); + + // The generic call operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the generic call operation. + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "StringRef callee, ArrayRef arguments"> + ]; +} + +def MulOp : Toy_Op<"mul", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + let arguments = (ins F64Tensor:$input); +} + +def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. For example: + + ```mlir + %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + ``` + }]; + + let arguments = (ins F64Tensor:$input); + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. 
+ let results = (outs StaticShapeTensorOf<[F64]>); +} + +def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + func @foo() -> tensor<2xf64> { + ... + toy.return %0 : tensor<2xf64> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // Allow building a ReturnOp with no return operand. + let builders = [OpBuilder< + "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] + >]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Invoke a static verify method to verify this return operation. + let verifier = [{ return ::verify(*this); }]; +} + +def TransposeOp : Toy_Op<"transpose", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "transpose operation"; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor); + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input"> + ]; + + // Invoke a static verify method to verify this transpose operation. 
+ let verifier = [{ return ::verify(*this); }]; +} + +#endif // TOY_OPS diff --git a/mlir/examples/toy/Ch4/include/toy/Parser.h b/mlir/examples/toy/Ch4/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/Parser.h @@ -0,0 +1,485 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PARSER_H +#define MLIR_TUTORIAL_TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. 
+ std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. 
+ if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. 
+ return std::make_unique(std::move(loc), name); + + // This is a function call. + lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. 
+ int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. + auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch4/include/toy/Passes.h b/mlir/examples/toy/Ch4/include/toy/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..93c51309008fca3771099b863854f0fe9e5655e5 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/Passes.h @@ -0,0 +1,27 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PASSES_H +#define MLIR_TUTORIAL_TOY_PASSES_H + +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createShapeInferencePass(); +std::unique_ptr createDeadFunctionEliminationPass(); +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_PASSES_H diff --git a/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.h b/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..da0fb66018ee4df1882d26f074ecd49a24ddcea9 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. 
+#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.td b/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..1b38ada1622862057ad2c18eabe147b875e18cf2 --- /dev/null +++ b/mlir/examples/toy/Ch4/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. 
+ }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/examples/toy/Ch4/mlir/DeadFunctionEliminationPass.cpp b/mlir/examples/toy/Ch4/mlir/DeadFunctionEliminationPass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee34547860cd98c27c21da874ad794a6d0c99d5 --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/DeadFunctionEliminationPass.cpp @@ -0,0 +1,59 @@ +//===- DeadFunctionEliminationPass.cpp - Eliminate inlined functions ------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Module level pass performing dead function +// elimination. This is required as a post-processing step after function +// inlining. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/Passes.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace { +/// This is a simple function DCE pass that deletes all non-main functions after +/// inlining. +/// TODO(riverriddle) This is only necessary because MLIR currently does not +/// have generic DCE support for functions. 
+class DeadFunctionEliminationPass + : public mlir::ModulePass { +public: + void runOnModule() override { + mlir::ModuleOp module = getModule(); + mlir::SymbolTable moduleSymTable(module); + + // Eliminate non-main functions. + auto mainFn = moduleSymTable.lookup("main"); + for (mlir::FuncOp func : + llvm::make_early_inc_range(module.getOps())) { + if (func != mainFn) + func.erase(); + } + } +}; +} // end anonymous namespace + +/// Create a pass that eliminates inlined functions in toy. +std::unique_ptr mlir::toy::createDeadFunctionEliminationPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a9ded0c3d38ae810d6dd114f4c3a0d85df65b60 --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp @@ -0,0 +1,261 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/InliningUtils.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. 
+struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, + BlockAndValueMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. + assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()]->replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. + Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. 
This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. +static mlir::LogicalResult verify(ConstantOp op) { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + op.getResult()->getType().dyn_cast(); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = op.value().getType().cast(); + if (attrType.getRank() != resultType.getRank()) { + return op.emitOpError( + "return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. 
+ for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return op.emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp + +void AddOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// CastOp + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. +void CastOp::inferShapes() { getResult()->setType(getOperand()->getType()); } + +//===----------------------------------------------------------------------===// +// GenericCallOp + +void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return getAttrOfType("callee"); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. 
+Operation::operand_range GenericCallOp::getArgOperands() { return inputs(); } + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+ if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = getOperand()->getType().cast(); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult()->setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9c960c79f47254d31e72037431a0f9d3a614276 --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp @@ -0,0 +1,452 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. 
+ theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &F : moduleAST) { + auto func = mlirGen(F); + if (!func) + return nullptr; + theModule.push_back(func); + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. + mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + mlir::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. 
+ // Arguments type are uniformly unranked tensors. + llvm::SmallVector arg_types(proto.getArgs().size(), + getType(VarType{})); + auto func_type = builder.getFunctionType(arg_types, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(name_value)->getName(), + std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. 
+ // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + getType(VarType{}))); + } + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. 
The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. + /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. 
Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + return builder.create(location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). + mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. 
+ mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. 
+ if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..517a1f075306485003e099ed805a23f77cb49147 --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp @@ -0,0 +1,104 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a FunctionPass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. +/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +class ShapeInferencePass : public mlir::FunctionPass { +public: + void runOnFunction() override { + auto f = getFunction(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. 
+ llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). + auto nextop = llvm::find_if(opWorklist, returnsDynamicShape); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LLVM_DEBUG(llvm::dbgs() << "Inferring shape for: " << *op << "\n"); + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. + if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !resultType.isa(); + }); + } +}; +} // end anonymous namespace + +/// Create a Shape Inference pass. 
+std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..82c247c1be2d4da5ac4419f9267381379f8b365c --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp @@ -0,0 +1,74 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "toy/Dialect.h" +#include +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // end anonymous namespace + +/// Fold simple cast operations that return the same type as the input. +OpFoldResult CastOp::fold(ArrayRef operands) { + return mlir::impl::foldCastOp(*this); +} + +/// This is an example of a c++ rewrite pattern for the TransposeOp. It +/// optimizes the following scenario: transpose(transpose(x)) -> transpose(x) +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. 
The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + mlir::PatternMatchResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = + llvm::dyn_cast_or_null(transposeInput->getDefiningOp()); + + // If the input is defined by another Transpose, bingo! + if (!transposeInputOp) + return matchFailure(); + + // Use the rewriter to perform the replacement. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp}); + return matchSuccess(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} diff --git a/mlir/examples/toy/Ch4/mlir/ToyCombine.td b/mlir/examples/toy/Ch4/mlir/ToyCombine.td new file mode 100644 index 0000000000000000000000000000000000000000..e6e33e84d7e8f3e13aea9840f3690029de025d94 --- /dev/null +++ b/mlir/examples/toy/Ch4/mlir/ToyCombine.td @@ -0,0 +1,62 @@ +//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines language-specific pattern match optimizations for Toy using +// Declarative Rewrite Rules (DRR) specified using TableGen records. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_COMBINE +#define TOY_COMBINE + +include "toy/Ops.td" + +/// Note: The DRR definition used for defining patterns is shown below: +/// +/// class Pattern< +/// dag sourcePattern, list resultPatterns, +/// list additionalConstraints = [], +/// dag benefitsAdded = (addBenefit 0) +/// >; + +//===----------------------------------------------------------------------===// +// Basic Pattern-Match and Rewrite +//===----------------------------------------------------------------------===// + +// Reshape(Reshape(x)) = Reshape(x) +def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)), + (ReshapeOp $arg)>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite using Native Code Call +//===----------------------------------------------------------------------===// + +// Native Code Calls may be used for more complex transformations using inline +// C++ and C++ helper functions. + +// Reshape(Constant(x)) = x' +def ReshapeConstant : + NativeCodeCall<"$0.reshape(($1->getType()).cast())">; +def FoldConstantReshapeOptPattern : Pat< + (ReshapeOp:$res (ConstantOp $arg)), + (ConstantOp (ReshapeConstant $arg, $res))>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite with Constraints +//===----------------------------------------------------------------------===// + +// DRR allows for constraint checking when the transformation is conditional +// on operand properties. 
+ +// Reshape(x) = x, where input and output shapes are identical +def TypesAreIdentical : ConstraintgetType() == $1->getType()">>; +def RedundantReshapeOptPattern : Pat< + (ReshapeOp:$res $arg), (replaceWithValue $arg), + [(TypesAreIdentical $res, $arg)]>; + +#endif // TOY_COMBINE diff --git a/mlir/examples/toy/Ch4/parser/AST.cpp b/mlir/examples/toy/Ch4/parser/AST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415 --- /dev/null +++ b/mlir/examples/toy/Ch4/parser/AST.cpp @@ -0,0 +1,234 @@ +//===- AST.cpp - Helper for printing out the Toy AST ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST dump for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "mlir/ADT/TypeSwitch.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. 
+class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. 
+void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. +void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. 
+void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. +void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. 
+void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7b584407f65627837129ef66ad864fe04115029 --- /dev/null +++ b/mlir/examples/toy/Ch4/toyc.cpp @@ -0,0 +1,167 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + 
cl::values(clEnumValN(MLIR, "mlir", + "load the input file as an MLIR file"))); + +namespace { +enum Action { None, DumpAST, DumpMLIR }; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump"))); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, + mlir::OwningModuleRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +int dumpMLIR() { + // Register our Dialect with MLIR. 
+ mlir::registerDialect(); + + mlir::MLIRContext context; + mlir::OwningModuleRef module; + llvm::SourceMgr sourceMgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); + if (int error = loadMLIR(sourceMgr, context, module)) + return error; + + if (enableOpt) { + mlir::PassManager pm(&context); + // Apply any generic pass manager command line options and run the pipeline. + applyPassManagerCLOptions(pm); + + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::toy::createDeadFunctionEliminationPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + if (mlir::failed(pm.run(*module))) + return 4; + } + + module->dump(); + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int main(int argc, char **argv) { + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + switch (emitAction) { + case Action::DumpAST: + return dumpAST(); + case Action::DumpMLIR: + return dumpMLIR(); + default: + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + } + + return 0; +} diff --git a/mlir/examples/toy/Ch5/CMakeLists.txt b/mlir/examples/toy/Ch5/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..df5239589de24040eb54a2af475ebbe35e16c0ee --- /dev/null +++ b/mlir/examples/toy/Ch5/CMakeLists.txt @@ -0,0 +1,42 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Support + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) 
+# Generate the rewrite-pattern implementations from the TableGen description.
+mlir_tablegen(ToyCombine.inc -gen-rewriters "-I${CMAKE_CURRENT_SOURCE_DIR}/include")
+add_public_tablegen_target(ToyCh5CombineIncGen)
+
+# The chapter 5 compiler executable and its sources.
+add_toy_chapter(toyc-ch5
+  toyc.cpp
+  parser/AST.cpp
+  mlir/MLIRGen.cpp
+  mlir/Dialect.cpp
+  mlir/DeadFunctionEliminationPass.cpp
+  mlir/LowerToAffineLoops.cpp
+  mlir/ShapeInferencePass.cpp
+  mlir/ToyCombine.cpp
+  )
+
+# Make sure all TableGen-generated headers exist before compiling the sources.
+add_dependencies(toyc-ch5 ToyCh5ShapeInferenceInterfaceIncGen)
+add_dependencies(toyc-ch5 ToyCh5OpsIncGen)
+add_dependencies(toyc-ch5 ToyCh5CombineIncGen)
+add_dependencies(toyc-ch5 MLIRCallOpInterfacesIncGen)
+include_directories(include/)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/)
+target_link_libraries(toyc-ch5
+  PRIVATE
+    MLIRAffineOps
+    MLIRAnalysis
+    MLIRIR
+    MLIRParser
+    MLIRPass
+    MLIRStandardOps
+    MLIRTransforms)
+
+# Force-link the dialect libraries so their static registrations run.
+whole_archive_link(toyc-ch5
+  MLIRAffineOps
+  MLIRStandardOps
+  )
diff --git a/mlir/examples/toy/Ch5/include/CMakeLists.txt b/mlir/examples/toy/Ch5/include/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b
--- /dev/null
+++ b/mlir/examples/toy/Ch5/include/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(toy)
diff --git a/mlir/examples/toy/Ch5/include/toy/AST.h b/mlir/examples/toy/Ch5/include/toy/AST.h
new file mode 100644
index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e
--- /dev/null
+++ b/mlir/examples/toy/Ch5/include/toy/AST.h
@@ -0,0 +1,242 @@
+//===- AST.h - Node definition for the Toy AST ----------------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AST for the Toy language.
It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. 
+class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. +class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. 
+class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. +class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch5/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch5/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaa932896d0f17e2a78f5336d3eda2bd11d285a7 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +mlir_tablegen(Ops.cpp.inc -gen-op-defs "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +add_public_tablegen_target(ToyCh5OpsIncGen) + +set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCh5ShapeInferenceInterfaceIncGen) diff --git a/mlir/examples/toy/Ch5/include/toy/Dialect.h b/mlir/examples/toy/Ch5/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..5e8b91dcf4843762db80cde22ef96a0b22929840 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/Dialect.h @@ -0,0 +1,46 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with 
LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IR Dialect for the Toy language.
+// See g3doc/Tutorials/Toy/Ch-2.md for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_
+#define MLIR_TUTORIAL_TOY_DIALECT_H_
+
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/StandardTypes.h"
+#include "toy/ShapeInferenceInterface.h"
+
+namespace mlir {
+namespace toy {
+
+/// This is the definition of the Toy dialect. A dialect inherits from
+/// mlir::Dialect and registers custom attributes, operations, and types (in its
+/// constructor). It can also override some general behavior exposed via virtual
+/// methods.
+class ToyDialect : public mlir::Dialect {
+public:
+  /// Constructor is declared here and defined out-of-line; presumably it
+  /// registers the generated Toy operations with `ctx` -- confirm in
+  /// Dialect.cpp.
+  explicit ToyDialect(mlir::MLIRContext *ctx);
+
+  /// Provide a utility accessor to the dialect namespace. This is used by
+  /// several utilities for casting between dialects.
+  static llvm::StringRef getDialectNamespace() { return "toy"; }
+};
+
+/// Include the auto-generated header file containing the declarations of the
+/// toy operations.
+#define GET_OP_CLASSES
+#include "toy/Ops.h.inc"
+
+} // end namespace toy
+} // end namespace mlir
+
+#endif // MLIR_TUTORIAL_TOY_DIALECT_H_
diff --git a/mlir/examples/toy/Ch5/include/toy/Lexer.h b/mlir/examples/toy/Ch5/include/toy/Lexer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6eff64ee5f09634041f76cbae11c18f8ca46d07c
--- /dev/null
+++ b/mlir/examples/toy/Ch5/include/toy/Lexer.h
@@ -0,0 +1,232 @@
+//===- Lexer.h - Lexer for the Toy language -------------------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. 
+ Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. 
+ lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. 
+ Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch5/include/toy/MLIRGen.h b/mlir/examples/toy/Ch5/include/toy/MLIRGen.h new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/MLIRGen.h @@ -0,0 +1,32 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_ +#define MLIR_TUTORIAL_TOY_MLIRGEN_H_ + +#include + +namespace mlir { +class MLIRContext; +class OwningModuleRef; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST); +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_ diff --git a/mlir/examples/toy/Ch5/include/toy/Ops.td b/mlir/examples/toy/Ch5/include/toy/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..410c5df246128bd8ddba8bc264a0ab9df9f65941 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/Ops.td @@ -0,0 +1,247 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Analysis/CallInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. +def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. 
+class Toy_Op<string mnemonic, list<OpTrait> traits = []> :
+    Op<Toy_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Toy Operations
+//===----------------------------------------------------------------------===//
+
+// We define a toy operation by inheriting from our base 'Toy_Op' class above.
+// Here we provide the mnemonic and a list of traits for the operation. The
+// constant operation is marked as 'NoSideEffect' as it is a pure operation
+// and may be removed if dead.
+def ConstantOp : Toy_Op<"constant", [NoSideEffect]> {
+  // Provide a summary and description for this operation. This can be used to
+  // auto-generate documentation of the operations within our dialect.
+  let summary = "constant";
+  let description = [{
+    Constant operation turns a literal into an SSA value. The data is attached
+    to the operation as an attribute. For example:
+
+    ```mlir
+      %0 = "toy.constant"()
+         { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> }
+        : () -> tensor<2x3xf64>
+    ```
+  }];
+
+  // The constant operation takes an attribute as the only input.
+  let arguments = (ins F64ElementsAttr:$value);
+
+  // The constant operation returns a single value of TensorType.
+  let results = (outs F64Tensor);
+
+  // Add custom build methods for the constant operation. These method populates
+  // the `state` that MLIR uses to create operations, i.e. these are used when
+  // using `builder.create<ConstantOp>(...)`.
+  let builders = [
+    // Build a constant with a given constant tensor value.
+    OpBuilder<"Builder *builder, OperationState &state, "
+              "DenseElementsAttr value", [{
+      build(builder, state, value.getType(), value);
+    }]>,
+
+    // Build a constant with a given constant floating-point value.
+    OpBuilder<"Builder *builder, OperationState &state, double value">
+  ];
+
+  // Invoke a static verify method to verify this constant operation.
+  let verifier = [{ return ::verify(*this); }];
+}
+
+def AddOp : Toy_Op<"add",
+    [NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> {
+  let summary = "element-wise addition operation";
+  let description = [{
+    The "add" operation performs element-wise addition between two tensors.
+    The shapes of the tensor operands are expected to match.
+  }];
+
+  let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs);
+  let results = (outs F64Tensor);
+
+  // Allow building an AddOp with from the two input operands.
+  let builders = [
+    OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs">
+  ];
+}
+
+def CastOp : Toy_Op<"cast",
+    [DeclareOpInterfaceMethods<ShapeInferenceOpInterface>, NoSideEffect,
+     SameOperandsAndResultShape]> {
+  let summary = "shape cast operation";
+  let description = [{
+    The "cast" operation converts a tensor from one type to an equivalent type
+    without changing any data elements. The source and destination types
+    must both be tensor types with the same element type. If both are ranked
+    then the rank should be the same and static dimensions should match. The
+    operation is invalid if converting to a mismatching constant dimension.
+  }];
+
+  let arguments = (ins F64Tensor:$input);
+  let results = (outs F64Tensor:$output);
+
+  // Set the folder bit so that we can fold redundant cast operations.
+  let hasFolder = 1;
+}
+
+def GenericCallOp : Toy_Op<"generic_call",
+    [DeclareOpInterfaceMethods<CallOpInterface>]> {
+  let summary = "generic call operation";
+  let description = [{
+    Generic calls represent calls to a user defined function that needs to
+    be specialized for the shape of its arguments. The callee name is attached
+    as a symbol reference via an attribute. The arguments list must match the
+    arguments expected by the callee. For example:
+
+    ```mlir
+     %4 = "toy.generic_call"(%1, %3) {callee = @my_func}
+           : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64>
+    ```
+
+    This is only valid if a function named "my_func" exists and takes two
+    arguments.
+  }];
+
+  // The generic call operation takes a symbol reference attribute as the
+  // callee, and inputs for the call.
+  let arguments = (ins FlatSymbolRefAttr:$callee, Variadic<F64Tensor>:$inputs);
+
+  // The generic call operation returns a single value of TensorType.
+  let results = (outs F64Tensor);
+
+  // Add custom build methods for the generic call operation.
+  let builders = [
+    OpBuilder<"Builder *builder, OperationState &state, "
+              "StringRef callee, ArrayRef<Value> arguments">
+  ];
+}
+
+def MulOp : Toy_Op<"mul",
+    [NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> {
+  let summary = "element-wise multiplication operation";
+  let description = [{
+    The "mul" operation performs element-wise multiplication between two
+    tensors. The shapes of the tensor operands are expected to match.
+  }];
+
+  let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs);
+  let results = (outs F64Tensor);
+
+  // Allow building a MulOp with from the two input operands.
+  let builders = [
+    OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs">
+  ];
+}
+
+def PrintOp : Toy_Op<"print"> {
+  let summary = "print operation";
+  let description = [{
+    The "print" builtin operation prints a given input tensor, and produces
+    no results.
+  }];
+
+  // The print operation takes an input tensor to print.
+  // We also allow a F64MemRef to enable interop during partial lowering.
+  let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input);
+}
+
+def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> {
+  let summary = "tensor reshape operation";
+  let description = [{
+    Reshape operation is transforming its input tensor into a new tensor with
+    the same number of elements but different shapes. For example:
+
+    ```mlir
+       %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64>
+    ```
+  }];
+
+  let arguments = (ins F64Tensor:$input);
+  let hasCanonicalizer = 1;
+
+  // We expect that the reshape operation returns a statically shaped tensor.
+  let results = (outs StaticShapeTensorOf<[F64]>);
+}
+
+def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> {
+  let summary = "return operation";
+  let description = [{
+    The "return" operation represents a return operation within a function.
+    The operation takes an optional tensor operand and produces no results.
+    The operand type must match the signature of the function that contains
+    the operation. For example:
+
+    ```mlir
+      func @foo() -> tensor<2xf64> {
+        ...
+        toy.return %0 : tensor<2xf64>
+      }
+    ```
+  }];
+
+  // The return operation takes an optional input operand to return. This
+  // value must match the return type of the enclosing function.
+  let arguments = (ins Variadic<F64Tensor>:$input);
+
+  // Allow building a ReturnOp with no return operand.
+  let builders = [OpBuilder<
+    "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }]
+  >];
+
+  // Provide extra utility definitions on the c++ operation class definition.
+  let extraClassDeclaration = [{
+    bool hasOperand() { return getNumOperands() != 0; }
+  }];
+
+  // Invoke a static verify method to verify this return operation.
+  let verifier = [{ return ::verify(*this); }];
+}
+
+def TransposeOp : Toy_Op<"transpose",
+    [NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> {
+  let summary = "transpose operation";
+
+  let arguments = (ins F64Tensor:$input);
+  let results = (outs F64Tensor);
+  let hasCanonicalizer = 1;
+
+  // Allow building a TransposeOp with from the input operand.
+  let builders = [
+    OpBuilder<"Builder *b, OperationState &state, Value input">
+  ];
+
+  // Invoke a static verify method to verify this transpose operation.
+  let verifier = [{ return ::verify(*this); }];
+}
+
+#endif // TOY_OPS
diff --git a/mlir/examples/toy/Ch5/include/toy/Parser.h b/mlir/examples/toy/Ch5/include/toy/Parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71
--- /dev/null
+++ b/mlir/examples/toy/Ch5/include/toy/Parser.h
@@ -0,0 +1,485 @@
+//===- Parser.h - Toy Language Parser -------------------------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the parser for the Toy language. It processes the Token
+// provided by the Lexer and returns an AST.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TUTORIAL_TOY_PARSER_H
+#define MLIR_TUTORIAL_TOY_PARSER_H
+
+#include "toy/AST.h"
+#include "toy/Lexer.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+namespace toy {
+
+/// This is a simple recursive parser for the Toy language. It produces a well
+/// formed AST from a stream of Token supplied by the Lexer. No semantic checks
+/// or symbol resolution is performed. For example, variables are referenced by
+/// string and the code could reference an undeclared variable and the parsing
+/// succeeds.
+class Parser {
+public:
+  /// Create a Parser for the supplied lexer.
+  Parser(Lexer &lexer) : lexer(lexer) {}
+
+  /// Parse a full Module. A module is a list of function definitions.
+  std::unique_ptr<ModuleAST> parseModule() {
+    lexer.getNextToken(); // prime the lexer
+
+    // Parse functions one at a time and accumulate in this vector.
+ std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. 
+ if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. 
+ return std::make_unique(std::move(loc), name); + + // This is a function call. + lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. 
+  int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. + auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/rhs. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// an identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch5/include/toy/Passes.h b/mlir/examples/toy/Ch5/include/toy/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..97a5d0db46c5b8fad86035a3b20a18d852ca84a5 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/Passes.h @@ -0,0 +1,32 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PASSES_H +#define MLIR_TUTORIAL_TOY_PASSES_H + +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createDeadFunctionEliminationPass(); +std::unique_ptr createShapeInferencePass(); + +/// Create a pass for lowering to operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr createLowerToAffinePass(); + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_PASSES_H diff --git a/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.h b/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..da0fb66018ee4df1882d26f074ecd49a24ddcea9 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. 
+#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.td b/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..1b38ada1622862057ad2c18eabe147b875e18cf2 --- /dev/null +++ b/mlir/examples/toy/Ch5/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. 
+ }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/examples/toy/Ch5/mlir/DeadFunctionEliminationPass.cpp b/mlir/examples/toy/Ch5/mlir/DeadFunctionEliminationPass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee34547860cd98c27c21da874ad794a6d0c99d5 --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/DeadFunctionEliminationPass.cpp @@ -0,0 +1,59 @@ +//===- DeadFunctionEliminationPass.cpp - Eliminate inlined functions ------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Module level pass performing dead function +// elimination. This is required as a post-processing step after function +// inlining. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/Passes.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace { +/// This is a simple function DCE pass that deletes all non-main functions after +/// inlining. +/// TODO(riverriddle) This is only necessary because MLIR currently does not +/// have generic DCE support for functions. 
+class DeadFunctionEliminationPass + : public mlir::ModulePass { +public: + void runOnModule() override { + mlir::ModuleOp module = getModule(); + mlir::SymbolTable moduleSymTable(module); + + // Eliminate non-main functions. + auto mainFn = moduleSymTable.lookup("main"); + for (mlir::FuncOp func : + llvm::make_early_inc_range(module.getOps())) { + if (func != mainFn) + func.erase(); + } + } +}; +} // end anonymous namespace + +/// Create a pass that eliminates inlined functions in toy. +std::unique_ptr mlir::toy::createDeadFunctionEliminationPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a9ded0c3d38ae810d6dd114f4c3a0d85df65b60 --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp @@ -0,0 +1,261 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/InliningUtils.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. 
+struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, + BlockAndValueMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. + assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()]->replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. + Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. 
This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. +static mlir::LogicalResult verify(ConstantOp op) { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + op.getResult()->getType().dyn_cast(); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = op.value().getType().cast(); + if (attrType.getRank() != resultType.getRank()) { + return op.emitOpError( + "return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. 
+ for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return op.emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp + +void AddOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// CastOp + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. +void CastOp::inferShapes() { getResult()->setType(getOperand()->getType()); } + +//===----------------------------------------------------------------------===// +// GenericCallOp + +void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return getAttrOfType("callee"); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. 
+Operation::operand_range GenericCallOp::getArgOperands() { return inputs(); } + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+ if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = getOperand()->getType().cast(); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult()->setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d6e76de069ce235033287496a0ed556789fcf4a --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp @@ -0,0 +1,309 @@ +//====- LowerToAffineLoops.cpp - Partial lowering from Toy to Affine+Std --===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops and standard operations. This lowering expects that all calls +// have been inlined, and all shapes have been resolved. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/Sequence.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns +//===----------------------------------------------------------------------===// + +/// Convert the given TensorType into the corresponding MemRefType. +static MemRefType convertTensorToMemRef(TensorType type) { + assert(type.hasRank() && "expected only ranked shapes"); + return MemRefType::get(type.getShape(), type.getElementType()); +} + +/// Insert an allocation and deallocation for the given MemRefType. +static Value insertAllocAndDealloc(MemRefType type, Location loc, + PatternRewriter &rewriter) { + auto alloc = rewriter.create(loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc.getOperation()->getBlock(); + alloc.getOperation()->moveBefore(&parentBlock->front()); + + // Make sure to deallocate this alloc at the end of the block. This is fine + // as toy functions have no control flow. + auto dealloc = rewriter.create(loc, alloc); + dealloc.getOperation()->moveBefore(&parentBlock->back()); + return alloc; +} + +/// This defines the function type used to process an iteration of a lowered +/// loop. 
It takes as input a rewriter, an array of memRefOperands corresponding +/// to the operands of the input operation, and the set of loop induction +/// variables for the iteration. It returns a value to store at the current +/// index of the iteration. +using LoopIterationFn = function_ref memRefOperands, + ArrayRef loopIvs)>; + +static void lowerOpToLoops(Operation *op, ArrayRef operands, + PatternRewriter &rewriter, + LoopIterationFn processIteration) { + auto tensorType = (*op->result_type_begin()).cast(); + auto loc = op->getLoc(); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // Create an empty affine loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (auto dim : tensorType.getShape()) { + auto loop = rewriter.create(loc, /*lb=*/0, dim, /*step=*/1); + loop.getBody()->clear(); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body and update the rewriter insertion point to the + // beginning of the loop. + rewriter.setInsertionPointToStart(loop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to the processing function with the rewriter, the memref + // operands, and the loop induction variables. This function will return the + // value to store at the current index. + Value valueToStore = processIteration(rewriter, operands, loopIvs); + rewriter.create(loc, valueToStore, alloc, + llvm::makeArrayRef(loopIvs)); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); +} + +namespace { +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public ConversionPattern { + BinaryOpLowering(MLIRContext *ctx) + : ConversionPattern(BinaryOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the BinaryOp. This + // allows for using the nice named accessors that are generated by the + // ODS. + typename BinaryOp::OperandAdaptor binaryAdaptor(memRefOperands); + + // Generate loads for the element of 'lhs' and 'rhs' at the inner + // loop. + auto loadedLhs = + rewriter.create(loc, binaryAdaptor.lhs(), loopIvs); + auto loadedRhs = + rewriter.create(loc, binaryAdaptor.rhs(), loopIvs); + + // Create the binary operation performed on the loaded values. 
+ return rewriter.create(loc, loadedLhs, loadedRhs); + }); + return matchSuccess(); + } +}; +using AddOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Constant operations +//===----------------------------------------------------------------------===// + +struct ConstantOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ConstantOp op, + PatternRewriter &rewriter) const final { + DenseElementsAttr constantValue = op.value(); + Location loc = op.getLoc(); + + // When lowering the constant operation, we allocate and assign the constant + // values to a corresponding memref allocation. + auto tensorType = op.getType().cast(); + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // We will be generating constant indices up-to the largest dimension. + // Create these constants up-front to avoid large amounts of redundant + // operations. + auto valueShape = memRefType.getShape(); + SmallVector constantIndices; + for (auto i : llvm::seq( + 0, *std::max_element(valueShape.begin(), valueShape.end()))) + constantIndices.push_back(rewriter.create(loc, i)); + + // The constant operation represents a multi-dimensional constant, so we + // will need to generate a store for each of the elements. The following + // functor recursively walks the dimensions of the constant shape, + // generating a store when the recursion hits the base case. + SmallVector indices; + auto valueIt = constantValue.getValues().begin(); + std::function storeElements = [&](uint64_t dimension) { + // The last dimension is the base case of the recursion, at this point + // we store the element at the given index. 
+ if (dimension == valueShape.size()) { + rewriter.create( + loc, rewriter.create(loc, *valueIt++), alloc, + llvm::makeArrayRef(indices)); + return; + } + + // Otherwise, iterate over the current dimension and add the indices to + // the list. + for (uint64_t i = 0, e = valueShape[dimension]; i != e; ++i) { + indices.push_back(constantIndices[i]); + storeElements(dimension + 1); + indices.pop_back(); + } + }; + + // Start the element storing recursion from the first dimension. + storeElements(/*dimension=*/0); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ReturnOp op, + PatternRewriter &rewriter) const final { + // During this lowering, we expect that all function calls have been + // inlined. + if (op.hasOperand()) + return matchFailure(); + + // We lower "toy.return" directly to "std.return". 
+ rewriter.replaceOpWithNewOp(op); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Transpose operations +//===----------------------------------------------------------------------===// + +struct TransposeOpLowering : public ConversionPattern { + TransposeOpLowering(MLIRContext *ctx) + : ConversionPattern(toy::TransposeOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the TransposeOp. + // This allows for using the nice named accessors that are generated + // by the ODS. + toy::TransposeOpOperandAdaptor transposeAdaptor(memRefOperands); + Value input = transposeAdaptor.input(); + + // Transpose the elements by generating a load from the reverse + // indices. + SmallVector reverseIvs(llvm::reverse(loopIvs)); + return rewriter.create(loc, input, reverseIvs); + }); + return matchSuccess(); + } +}; + +} // end anonymous namespace. + +//===----------------------------------------------------------------------===// +// ToyToAffineLoweringPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering to affine loops of the toy operations that are +/// computationally intensive (like matmul for example...) while keeping the +/// rest of the code in the Toy dialect. +namespace { +struct ToyToAffineLoweringPass : public FunctionPass { + void runOnFunction() final; +}; +} // end anonymous namespace. + +void ToyToAffineLoweringPass::runOnFunction() { + auto function = getFunction(); + + // We only lower the main function as we expect that all other functions have + // been inlined. 
+ if (function.getName() != "main") + return; + + // Verify that the given main has no inputs and results. + if (function.getNumArguments() || function.getType().getNumResults()) { + function.emitError("expected 'main' to have 0 inputs and 0 results"); + return signalPassFailure(); + } + + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine` and `Standard` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that don't want + // to lower, `toy.print`, as `legal`. + target.addIllegalDialect(); + target.addLegalOp(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + if (failed(applyPartialConversion(getFunction(), target, patterns))) + signalPassFailure(); +} + +/// Create a pass for lowering operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). 
+std::unique_ptr mlir::toy::createLowerToAffinePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9c960c79f47254d31e72037431a0f9d3a614276 --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp @@ -0,0 +1,452 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. 
+class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. + theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &F : moduleAST) { + auto func = mlirGen(F); + if (!func) + return nullptr; + theModule.push_back(func); + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. + mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. 
+ mlir::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. + // Arguments type are uniformly unranked tensors. + llvm::SmallVector arg_types(proto.getArgs().size(), + getType(VarType{})); + auto func_type = builder.getFunctionType(arg_types, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(name_value)->getName(), + std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. 
+ if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. + // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + getType(VarType{}))); + } + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. 
+ switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. + /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. 
Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + return builder.create(location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). + mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. 
+ mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. 
+ if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..517a1f075306485003e099ed805a23f77cb49147 --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp @@ -0,0 +1,104 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a FunctionPass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. +/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +class ShapeInferencePass : public mlir::FunctionPass { +public: + void runOnFunction() override { + auto f = getFunction(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. 
+ llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). + auto nextop = llvm::find_if(opWorklist, returnsDynamicShape); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LLVM_DEBUG(llvm::dbgs() << "Inferring shape for: " << *op << "\n"); + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. + if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !resultType.isa(); + }); + } +}; +} // end anonymous namespace + +/// Create a Shape Inference pass. 
+std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..82c247c1be2d4da5ac4419f9267381379f8b365c --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp @@ -0,0 +1,74 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "toy/Dialect.h" +#include +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // end anonymous namespace + +/// Fold simple cast operations that return the same type as the input. +OpFoldResult CastOp::fold(ArrayRef operands) { + return mlir::impl::foldCastOp(*this); +} + +/// This is an example of a c++ rewrite pattern for the TransposeOp. It +/// optimizes the following scenario: transpose(transpose(x)) -> transpose(x) +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. 
The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + mlir::PatternMatchResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = + llvm::dyn_cast_or_null(transposeInput->getDefiningOp()); + + // If the input is defined by another Transpose, bingo! + if (!transposeInputOp) + return matchFailure(); + + // Use the rewriter to perform the replacement. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp}); + return matchSuccess(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} diff --git a/mlir/examples/toy/Ch5/mlir/ToyCombine.td b/mlir/examples/toy/Ch5/mlir/ToyCombine.td new file mode 100644 index 0000000000000000000000000000000000000000..e6e33e84d7e8f3e13aea9840f3690029de025d94 --- /dev/null +++ b/mlir/examples/toy/Ch5/mlir/ToyCombine.td @@ -0,0 +1,62 @@ +//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines language-specific pattern match optimizations for Toy using
// Declarative Rewrite Rules (DRR) specified using TableGen records.
//
//===----------------------------------------------------------------------===//

#ifndef TOY_COMBINE
#define TOY_COMBINE

include "toy/Ops.td"

/// Note: The DRR definition used for defining patterns is shown below:
///
/// class Pattern<
///    dag sourcePattern, list<dag> resultPatterns,
///    list<dag> additionalConstraints = [],
///    dag benefitsAdded = (addBenefit 0)
/// >;

//===----------------------------------------------------------------------===//
// Basic Pattern-Match and Rewrite
//===----------------------------------------------------------------------===//

// Reshape(Reshape(x)) = Reshape(x)
def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)),
                                   (ReshapeOp $arg)>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite using Native Code Call
//===----------------------------------------------------------------------===//

// Native Code Calls may be used for more complex transformations using inline
// C++ and C++ helper functions.

// Reshape(Constant(x)) = x'
def ReshapeConstant :
  NativeCodeCall<"$0.reshape(($1->getType()).cast<ShapedType>())">;
def FoldConstantReshapeOptPattern : Pat<
  (ReshapeOp:$res (ConstantOp $arg)),
  (ConstantOp (ReshapeConstant $arg, $res))>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite with Constraints
//===----------------------------------------------------------------------===//

// DRR allows for constraint checking when the transformation is conditional
// on operand properties.

// Reshape(x) = x, where input and output shapes are identical
def TypesAreIdentical : Constraint<CPred<"$0->getType() == $1->getType()">>;
def RedundantReshapeOptPattern : Pat<
  (ReshapeOp:$res $arg), (replaceWithValue $arg),
  [(TypesAreIdentical $res, $arg)]>;

#endif // TOY_COMBINE
diff --git a/mlir/examples/toy/Ch5/parser/AST.cpp b/mlir/examples/toy/Ch5/parser/AST.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415
--- /dev/null
+++ b/mlir/examples/toy/Ch5/parser/AST.cpp
@@ -0,0 +1,234 @@
//===- AST.cpp - Helper for printing out the Toy AST ----------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AST dump for the Toy language.
//
//===----------------------------------------------------------------------===//

#include "toy/AST.h"

#include "mlir/ADT/TypeSwitch.h"
#include "mlir/Support/STLExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/raw_ostream.h"

using namespace toy;

namespace {

// RAII helper to manage increasing/decreasing the indentation as we traverse
// the AST
struct Indent {
  Indent(int &level) : level(level) { ++level; }
  ~Indent() { --level; }
  int &level;
};

/// Helper class that implements the AST tree traversal and prints the nodes
/// along the way. The only data member is the current indentation level.
+class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. 
+void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. +void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. 
+void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. +void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. 
+void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..836968e218871e35e4ab8e06a5fb0544d954a30a --- /dev/null +++ b/mlir/examples/toy/Ch5/toyc.cpp @@ -0,0 +1,188 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + 
cl::values(clEnumValN(MLIR, "mlir", + "load the input file as an MLIR file"))); + +namespace { +enum Action { None, DumpAST, DumpMLIR, DumpMLIRAffine }; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump")), + cl::values(clEnumValN(DumpMLIRAffine, "mlir-affine", + "output the MLIR dump after affine lowering"))); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, + mlir::OwningModuleRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. 
+ sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +int dumpMLIR() { + // Register our Dialect with MLIR. + mlir::registerDialect(); + + mlir::MLIRContext context; + mlir::OwningModuleRef module; + llvm::SourceMgr sourceMgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); + if (int error = loadMLIR(sourceMgr, context, module)) + return error; + + mlir::PassManager pm(&context); + // Apply any generic pass manager command line options and run the pipeline. + applyPassManagerCLOptions(pm); + + // Check to see what granularity of MLIR we are compiling to. + bool isLoweringToAffine = emitAction >= Action::DumpMLIRAffine; + + if (enableOpt || isLoweringToAffine) { + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::toy::createDeadFunctionEliminationPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + } + + if (isLoweringToAffine) { + // Partially lower the toy dialect with a few cleanups afterwards. + pm.addPass(mlir::toy::createLowerToAffinePass()); + + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Add optimizations if enabled. 
+ if (enableOpt) { + optPM.addPass(mlir::createLoopFusionPass()); + optPM.addPass(mlir::createMemRefDataFlowOptPass()); + } + } + + if (mlir::failed(pm.run(*module))) + return 4; + + module->dump(); + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int main(int argc, char **argv) { + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + switch (emitAction) { + case Action::DumpAST: + return dumpAST(); + case Action::DumpMLIR: + case Action::DumpMLIRAffine: + return dumpMLIR(); + default: + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + } + + return 0; +} diff --git a/mlir/examples/toy/Ch6/CMakeLists.txt b/mlir/examples/toy/Ch6/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c342ed1d4a03fe5b316a6e2c8e90e3296f8a5d12 --- /dev/null +++ b/mlir/examples/toy/Ch6/CMakeLists.txt @@ -0,0 +1,53 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Core + Support + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters "-I${CMAKE_CURRENT_SOURCE_DIR}/include") +add_public_tablegen_target(ToyCh6CombineIncGen) + +add_toy_chapter(toyc-ch6 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/DeadFunctionEliminationPass.cpp + mlir/LowerToAffineLoops.cpp + mlir/LowerToLLVM.cpp + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp + ) + +add_dependencies(toyc-ch6 ToyCh6ShapeInferenceInterfaceIncGen) +add_dependencies(toyc-ch6 ToyCh6OpsIncGen) +add_dependencies(toyc-ch6 ToyCh6CombineIncGen) +add_dependencies(toyc-ch6 MLIRCallOpInterfacesIncGen) +include_directories(include/) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) 
+target_link_libraries(toyc-ch6 + PRIVATE + MLIRAffineOps + MLIRAffineToStandard + MLIRAnalysis + MLIRExecutionEngine + MLIRIR + MLIRLLVMIR + MLIRLoopToStandard + MLIRParser + MLIRPass + MLIRStandardOps + MLIRStandardToLLVM + MLIRTargetLLVMIR + MLIRTransforms + ) + +whole_archive_link(toyc-ch6 + MLIRAffineToStandard + MLIRAffineOps + MLIRLLVMIR + MLIRStandardOps + ) diff --git a/mlir/examples/toy/Ch6/include/CMakeLists.txt b/mlir/examples/toy/Ch6/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/toy/Ch6/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/examples/toy/Ch6/include/toy/AST.h b/mlir/examples/toy/Ch6/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..820600b5b1c900cbeedce7545bad458f096cc92e --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/AST.h @@ -0,0 +1,242 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with shape information. 
+struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. 
+class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. 
+class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() -> decltype(functions.begin()) { return functions.begin(); } + auto end() -> decltype(functions.end()) { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch6/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..aecf11fab6c94d392e6a83244cc0ed4cd3fb4b14 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +mlir_tablegen(Ops.cpp.inc -gen-op-defs "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +add_public_tablegen_target(ToyCh6OpsIncGen) + +set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCh6ShapeInferenceInterfaceIncGen) diff --git a/mlir/examples/toy/Ch6/include/toy/Dialect.h b/mlir/examples/toy/Ch6/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..5e8b91dcf4843762db80cde22ef96a0b22929840 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/Dialect.h @@ -0,0 +1,46 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. +// See g3doc/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/StandardTypes.h" +#include "toy/ShapeInferenceInterface.h" + +namespace mlir { +namespace toy { + +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods. +class ToyDialect : public mlir::Dialect { +public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities for casting between dialects. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. +#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/Lexer.h b/mlir/examples/toy/Ch6/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..6eff64ee5f09634041f76cbae11c18f8ca46d07c --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/Lexer.h @@ -0,0 +1,232 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. 
+ Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. 
+ lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. 
+ Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/MLIRGen.h b/mlir/examples/toy/Ch6/include/toy/MLIRGen.h new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/MLIRGen.h @@ -0,0 +1,32 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_ +#define MLIR_TUTORIAL_TOY_MLIRGEN_H_ + +#include + +namespace mlir { +class MLIRContext; +class OwningModuleRef; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST); +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/Ops.td b/mlir/examples/toy/Ch6/include/toy/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..410c5df246128bd8ddba8bc264a0ab9df9f65941 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/Ops.td @@ -0,0 +1,247 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Analysis/CallInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. +def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. 
+class Toy_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'NoSideEffect' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F64ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. these are used when + // using `builder.create(...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<"Builder *builder, OperationState &state, " + "DenseElementsAttr value", [{ + build(builder, state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<"Builder *builder, OperationState &state, double value"> + ]; + + // Invoke a static verify method to verify this constant operation. 
+ let verifier = [{ return ::verify(*this); }]; +} + +def AddOp : Toy_Op<"add", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def CastOp : Toy_Op<"cast", + [DeclareOpInterfaceMethods, NoSideEffect, + SameOperandsAndResultShape]> { + let summary = "shape cast operation"; + let description = [{ + The "cast" operation converts a tensor from one type to an equivalent type + without changing any data elements. The source and destination types + must both be tensor types with the same element type. If both are ranked + then the rank should be the same and static dimensions should match. The + operation is invalid if converting to a mismatching constant dimension. + }]; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor:$output); + + // Set the folder bit so that we can fold redundant cast operations. + let hasFolder = 1; +} + +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. 
+ }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$inputs); + + // The generic call operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the generic call operation. + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "StringRef callee, ArrayRef arguments"> + ]; +} + +def MulOp : Toy_Op<"mul", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + // We also allow a F64MemRef to enable interop during partial lowering. + let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input); +} + +def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. For example: + + ```mlir + %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + ``` + }]; + + let arguments = (ins F64Tensor:$input); + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. 
+ let results = (outs StaticShapeTensorOf<[F64]>); +} + +def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + func @foo() -> tensor<2xf64> { + ... + toy.return %0 : tensor<2xf64> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // Allow building a ReturnOp with no return operand. + let builders = [OpBuilder< + "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] + >]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Invoke a static verify method to verify this return operation. + let verifier = [{ return ::verify(*this); }]; +} + +def TransposeOp : Toy_Op<"transpose", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "transpose operation"; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor); + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input"> + ]; + + // Invoke a static verify method to verify this transpose operation. 
+ let verifier = [{ return ::verify(*this); }]; +} + +#endif // TOY_OPS diff --git a/mlir/examples/toy/Ch6/include/toy/Parser.h b/mlir/examples/toy/Ch6/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..4557ea26859de3d0a6b71448f4bef030167c3e71 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/Parser.h @@ -0,0 +1,485 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PARSER_H +#define MLIR_TUTORIAL_TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. 
+ std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. 
+ if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. 
+ return std::make_unique(std::move(loc), name); + + // This is a function call. + lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. 
+ int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. + auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch6/include/toy/Passes.h b/mlir/examples/toy/Ch6/include/toy/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..33c2021c8db298671d41987e10de508507065f15 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/Passes.h @@ -0,0 +1,36 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PASSES_H +#define MLIR_TUTORIAL_TOY_PASSES_H + +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createDeadFunctionEliminationPass(); +std::unique_ptr createShapeInferencePass(); + +/// Create a pass for lowering to operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr createLowerToAffinePass(); + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. +std::unique_ptr createLowerToLLVMPass(); + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_PASSES_H diff --git a/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.h b/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..da0fb66018ee4df1882d26f074ecd49a24ddcea9 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. +#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.td b/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..1b38ada1622862057ad2c18eabe147b875e18cf2 --- /dev/null +++ b/mlir/examples/toy/Ch6/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. 
+ }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/examples/toy/Ch6/mlir/DeadFunctionEliminationPass.cpp b/mlir/examples/toy/Ch6/mlir/DeadFunctionEliminationPass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee34547860cd98c27c21da874ad794a6d0c99d5 --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/DeadFunctionEliminationPass.cpp @@ -0,0 +1,59 @@ +//===- DeadFunctionEliminationPass.cpp - Eliminate inlined functions ------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Module level pass performing dead function +// elimination. This is required as a post-processing step after function +// inlining. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/Passes.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace { +/// This is a simple function DCE pass that deletes all non-main functions after +/// inlining. +/// TODO(riverriddle) This is only necessary because MLIR currently does not +/// have generic DCE support for functions. 
+class DeadFunctionEliminationPass + : public mlir::ModulePass { +public: + void runOnModule() override { + mlir::ModuleOp module = getModule(); + mlir::SymbolTable moduleSymTable(module); + + // Eliminate non-main functions. + auto mainFn = moduleSymTable.lookup("main"); + for (mlir::FuncOp func : + llvm::make_early_inc_range(module.getOps())) { + if (func != mainFn) + func.erase(); + } + } +}; +} // end anonymous namespace + +/// Create a pass that eliminates inlined functions in toy. +std::unique_ptr mlir::toy::createDeadFunctionEliminationPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a9ded0c3d38ae810d6dd114f4c3a0d85df65b60 --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp @@ -0,0 +1,261 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/InliningUtils.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. 
+struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, + BlockAndValueMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. + assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()]->replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. + Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. 
This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. +static mlir::LogicalResult verify(ConstantOp op) { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + op.getResult()->getType().dyn_cast(); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = op.value().getType().cast(); + if (attrType.getRank() != resultType.getRank()) { + return op.emitOpError( + "return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. 
+ for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return op.emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp + +void AddOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// CastOp + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. +void CastOp::inferShapes() { getResult()->setType(getOperand()->getType()); } + +//===----------------------------------------------------------------------===// +// GenericCallOp + +void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return getAttrOfType("callee"); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. 
+Operation::operand_range GenericCallOp::getArgOperands() { return inputs(); } + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+ if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = getOperand()->getType().cast(); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult()->setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d6e76de069ce235033287496a0ed556789fcf4a --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp @@ -0,0 +1,309 @@ +//====- LowerToAffineLoops.cpp - Partial lowering from Toy to Affine+Std --===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops and standard operations. This lowering expects that all calls +// have been inlined, and all shapes have been resolved. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/Sequence.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns +//===----------------------------------------------------------------------===// + +/// Convert the given TensorType into the corresponding MemRefType. +static MemRefType convertTensorToMemRef(TensorType type) { + assert(type.hasRank() && "expected only ranked shapes"); + return MemRefType::get(type.getShape(), type.getElementType()); +} + +/// Insert an allocation and deallocation for the given MemRefType. +static Value insertAllocAndDealloc(MemRefType type, Location loc, + PatternRewriter &rewriter) { + auto alloc = rewriter.create(loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc.getOperation()->getBlock(); + alloc.getOperation()->moveBefore(&parentBlock->front()); + + // Make sure to deallocate this alloc at the end of the block. This is fine + // as toy functions have no control flow. + auto dealloc = rewriter.create(loc, alloc); + dealloc.getOperation()->moveBefore(&parentBlock->back()); + return alloc; +} + +/// This defines the function type used to process an iteration of a lowered +/// loop. 
It takes as input a rewriter, an array of memRefOperands corresponding +/// to the operands of the input operation, and the set of loop induction +/// variables for the iteration. It returns a value to store at the current +/// index of the iteration. +using LoopIterationFn = function_ref memRefOperands, + ArrayRef loopIvs)>; + +static void lowerOpToLoops(Operation *op, ArrayRef operands, + PatternRewriter &rewriter, + LoopIterationFn processIteration) { + auto tensorType = (*op->result_type_begin()).cast(); + auto loc = op->getLoc(); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // Create an empty affine loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (auto dim : tensorType.getShape()) { + auto loop = rewriter.create(loc, /*lb=*/0, dim, /*step=*/1); + loop.getBody()->clear(); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body and update the rewriter insertion point to the + // beginning of the loop. + rewriter.setInsertionPointToStart(loop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to the processing function with the rewriter, the memref + // operands, and the loop induction variables. This function will return the + // value to store at the current index. + Value valueToStore = processIteration(rewriter, operands, loopIvs); + rewriter.create(loc, valueToStore, alloc, + llvm::makeArrayRef(loopIvs)); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); +} + +namespace { +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public ConversionPattern { + BinaryOpLowering(MLIRContext *ctx) + : ConversionPattern(BinaryOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the BinaryOp. This + // allows for using the nice named accessors that are generated by the + // ODS. + typename BinaryOp::OperandAdaptor binaryAdaptor(memRefOperands); + + // Generate loads for the element of 'lhs' and 'rhs' at the inner + // loop. + auto loadedLhs = + rewriter.create(loc, binaryAdaptor.lhs(), loopIvs); + auto loadedRhs = + rewriter.create(loc, binaryAdaptor.rhs(), loopIvs); + + // Create the binary operation performed on the loaded values. 
+ return rewriter.create(loc, loadedLhs, loadedRhs); + }); + return matchSuccess(); + } +}; +using AddOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Constant operations +//===----------------------------------------------------------------------===// + +struct ConstantOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ConstantOp op, + PatternRewriter &rewriter) const final { + DenseElementsAttr constantValue = op.value(); + Location loc = op.getLoc(); + + // When lowering the constant operation, we allocate and assign the constant + // values to a corresponding memref allocation. + auto tensorType = op.getType().cast(); + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // We will be generating constant indices up-to the largest dimension. + // Create these constants up-front to avoid large amounts of redundant + // operations. + auto valueShape = memRefType.getShape(); + SmallVector constantIndices; + for (auto i : llvm::seq( + 0, *std::max_element(valueShape.begin(), valueShape.end()))) + constantIndices.push_back(rewriter.create(loc, i)); + + // The constant operation represents a multi-dimensional constant, so we + // will need to generate a store for each of the elements. The following + // functor recursively walks the dimensions of the constant shape, + // generating a store when the recursion hits the base case. + SmallVector indices; + auto valueIt = constantValue.getValues().begin(); + std::function storeElements = [&](uint64_t dimension) { + // The last dimension is the base case of the recursion, at this point + // we store the element at the given index. 
+ if (dimension == valueShape.size()) { + rewriter.create( + loc, rewriter.create(loc, *valueIt++), alloc, + llvm::makeArrayRef(indices)); + return; + } + + // Otherwise, iterate over the current dimension and add the indices to + // the list. + for (uint64_t i = 0, e = valueShape[dimension]; i != e; ++i) { + indices.push_back(constantIndices[i]); + storeElements(dimension + 1); + indices.pop_back(); + } + }; + + // Start the element storing recursion from the first dimension. + storeElements(/*dimension=*/0); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ReturnOp op, + PatternRewriter &rewriter) const final { + // During this lowering, we expect that all function calls have been + // inlined. + if (op.hasOperand()) + return matchFailure(); + + // We lower "toy.return" directly to "std.return". 
+ rewriter.replaceOpWithNewOp(op); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Transpose operations +//===----------------------------------------------------------------------===// + +struct TransposeOpLowering : public ConversionPattern { + TransposeOpLowering(MLIRContext *ctx) + : ConversionPattern(toy::TransposeOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the TransposeOp. + // This allows for using the nice named accessors that are generated + // by the ODS. + toy::TransposeOpOperandAdaptor transposeAdaptor(memRefOperands); + Value input = transposeAdaptor.input(); + + // Transpose the elements by generating a load from the reverse + // indices. + SmallVector reverseIvs(llvm::reverse(loopIvs)); + return rewriter.create(loc, input, reverseIvs); + }); + return matchSuccess(); + } +}; + +} // end anonymous namespace. + +//===----------------------------------------------------------------------===// +// ToyToAffineLoweringPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering to affine loops of the toy operations that are +/// computationally intensive (like matmul for example...) while keeping the +/// rest of the code in the Toy dialect. +namespace { +struct ToyToAffineLoweringPass : public FunctionPass { + void runOnFunction() final; +}; +} // end anonymous namespace. + +void ToyToAffineLoweringPass::runOnFunction() { + auto function = getFunction(); + + // We only lower the main function as we expect that all other functions have + // been inlined. 
+ if (function.getName() != "main") + return; + + // Verify that the given main has no inputs and results. + if (function.getNumArguments() || function.getType().getNumResults()) { + function.emitError("expected 'main' to have 0 inputs and 0 results"); + return signalPassFailure(); + } + + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine` and `Standard` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that we don't + // want to lower, `toy.print`, as `legal`. + target.addIllegalDialect(); + target.addLegalOp(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + if (failed(applyPartialConversion(getFunction(), target, patterns))) + signalPassFailure(); +} + +/// Create a pass for lowering operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). 
+std::unique_ptr mlir::toy::createLowerToAffinePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f1a6ae8bbee6c850f6c1e26e6c595b34a19b5ab --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp @@ -0,0 +1,204 @@ +//====- LowerToLLVM.cpp - Lowering from Toy+Affine+Std to LLVM ------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops and standard operations. This lowering expects that all calls +// have been inlined, and all shapes have been resolved. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/Sequence.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToLLVM RewritePatterns +//===----------------------------------------------------------------------===// + +namespace { +/// Lowers `toy.print` to a loop nest calling `printf` on each of the individual +/// 
elements of the array. +class PrintOpLowering : public ConversionPattern { +public: + explicit PrintOpLowering(MLIRContext *context) + : ConversionPattern(toy::PrintOp::getOperationName(), 1, context) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto memRefType = (*op->operand_type_begin()).cast(); + auto memRefShape = memRefType.getShape(); + auto loc = op->getLoc(); + auto *llvmDialect = + op->getContext()->getRegisteredDialect(); + assert(llvmDialect && "expected llvm dialect to be registered"); + + ModuleOp parentModule = op->getParentOfType(); + + // Get a symbol reference to the printf function, inserting it if necessary. + auto printfRef = getOrInsertPrintf(rewriter, parentModule, llvmDialect); + Value formatSpecifierCst = getOrCreateGlobalString( + loc, rewriter, "frmt_spec", StringRef("%f \0", 4), parentModule, + llvmDialect); + Value newLineCst = getOrCreateGlobalString( + loc, rewriter, "nl", StringRef("\n\0", 2), parentModule, llvmDialect); + + // Create a loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (unsigned i = 0, e = memRefShape.size(); i != e; ++i) { + auto lowerBound = rewriter.create(loc, 0); + auto upperBound = rewriter.create(loc, memRefShape[i]); + auto step = rewriter.create(loc, 1); + auto loop = + rewriter.create(loc, lowerBound, upperBound, step); + loop.getBody()->clear(); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body. + rewriter.setInsertionPointToStart(loop.getBody()); + + // Insert a newline after each of the inner dimensions of the shape. + if (i != e - 1) + rewriter.create(loc, printfRef, rewriter.getIntegerType(32), + newLineCst); + rewriter.create(loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to printf for the current element of the loop. 
+ auto printOp = cast(op); + auto elementLoad = rewriter.create(loc, printOp.input(), loopIvs); + rewriter.create(loc, printfRef, rewriter.getIntegerType(32), + ArrayRef({formatSpecifierCst, elementLoad})); + + // Notify the rewriter that this operation has been removed. + rewriter.eraseOp(op); + return matchSuccess(); + } + +private: + /// Return a symbol reference to the printf function, inserting it into the + /// module if necessary. + static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter, + ModuleOp module, + LLVM::LLVMDialect *llvmDialect) { + auto *context = module.getContext(); + if (module.lookupSymbol("printf")) + return SymbolRefAttr::get("printf", context); + + // Create a function declaration for printf, the signature is: + // * `i32 (i8*, ...)` + auto llvmI32Ty = LLVM::LLVMType::getInt32Ty(llvmDialect); + auto llvmI8PtrTy = LLVM::LLVMType::getInt8PtrTy(llvmDialect); + auto llvmFnType = LLVM::LLVMType::getFunctionTy(llvmI32Ty, llvmI8PtrTy, + /*isVarArg=*/true); + + // Insert the printf function into the body of the parent module. + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + rewriter.create(module.getLoc(), "printf", llvmFnType); + return SymbolRefAttr::get("printf", context); + } + + /// Return a value representing an access into a global string with the given + /// name, creating the string if necessary. + static Value getOrCreateGlobalString(Location loc, OpBuilder &builder, + StringRef name, StringRef value, + ModuleOp module, + LLVM::LLVMDialect *llvmDialect) { + // Create the global at the entry of the module. 
+ LLVM::GlobalOp global; + if (!(global = module.lookupSymbol(name))) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto type = LLVM::LLVMType::getArrayTy( + LLVM::LLVMType::getInt8Ty(llvmDialect), value.size()); + global = builder.create(loc, type, /*isConstant=*/true, + LLVM::Linkage::Internal, name, + builder.getStringAttr(value)); + } + + // Get the pointer to the first character in the global string. + Value globalPtr = builder.create(loc, global); + Value cst0 = builder.create( + loc, LLVM::LLVMType::getInt64Ty(llvmDialect), + builder.getIntegerAttr(builder.getIndexType(), 0)); + return builder.create( + loc, LLVM::LLVMType::getInt8PtrTy(llvmDialect), globalPtr, + ArrayRef({cst0, cst0})); + } +}; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// ToyToLLVMLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToLLVMLoweringPass : public ModulePass { + void runOnModule() final; +}; +} // end anonymous namespace + +void ToyToLLVMLoweringPass::runOnModule() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. For this lowering, we are only targeting + // the LLVM dialect. + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalOp(); + + // During this lowering, we will also be lowering the MemRef types, that are + // currently being operated on, to a representation in LLVM. To perform this + // conversion we use a TypeConverter as part of the lowering. This converter + // details how one type maps to another. This is necessary now that we will be + // doing more complicated lowerings, involving loop region arguments. + LLVMTypeConverter typeConverter(&getContext()); + + // Now that the conversion target has been defined, we need to provide the + // patterns used for lowering. 
At this point of the compilation process, we + // have a combination of `toy`, `affine`, and `std` operations. Luckily, there + // already exists a set of patterns to transform `affine` and `std` + // dialects. These patterns lower in multiple stages, relying on transitive + // lowerings. Transitive lowering, or A->B->C lowering, is when multiple + // patterns must be applied to fully transform an illegal operation into a + // set of legal ones. + OwningRewritePatternList patterns; + populateAffineToStdConversionPatterns(patterns, &getContext()); + populateLoopToStdConversionPatterns(patterns, &getContext()); + populateStdToLLVMConversionPatterns(typeConverter, patterns); + + // The only remaining operation to lower from the `toy` dialect, is the + // PrintOp. + patterns.insert(&getContext()); + + // We want to completely lower to LLVM, so we use a `FullConversion`. This + // ensures that only legal operations will remain after the conversion. + auto module = getModule(); + if (failed(applyFullConversion(module, target, patterns, &typeConverter))) + signalPassFailure(); +} + +/// Create a pass for lowering the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. +std::unique_ptr mlir::toy::createLowerToLLVMPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d9c960c79f47254d31e72037431a0f9d3a614276 --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp @@ -0,0 +1,452 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. 
+ theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &F : moduleAST) { + auto func = mlirGen(F); + if (!func) + return nullptr; + theModule.push_back(func); + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. + mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + mlir::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. 
+ // Arguments type are uniformly unranked tensors. + llvm::SmallVector arg_types(proto.getArgs().size(), + getType(VarType{})); + auto func_type = builder.getFunctionType(arg_types, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(name_value)->getName(), + std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. 
+ // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + getType(VarType{}))); + } + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. 
The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. + /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. 
Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + return builder.create(location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). + mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. 
+ mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. 
+ if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..517a1f075306485003e099ed805a23f77cb49147 --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp @@ -0,0 +1,104 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a FunctionPass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. +/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +class ShapeInferencePass : public mlir::FunctionPass { +public: + void runOnFunction() override { + auto f = getFunction(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. 
+ llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). + auto nextop = llvm::find_if(opWorklist, returnsDynamicShape); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LLVM_DEBUG(llvm::dbgs() << "Inferring shape for: " << *op << "\n"); + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. + if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !resultType.isa(); + }); + } +}; +} // end anonymous namespace + +/// Create a Shape Inference pass. 
+std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..82c247c1be2d4da5ac4419f9267381379f8b365c --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp @@ -0,0 +1,74 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "toy/Dialect.h" +#include +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // end anonymous namespace + +/// Fold simple cast operations that return the same type as the input. +OpFoldResult CastOp::fold(ArrayRef operands) { + return mlir::impl::foldCastOp(*this); +} + +/// This is an example of a c++ rewrite pattern for the TransposeOp. It +/// optimizes the following scenario: transpose(transpose(x)) -> transpose(x) +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. 
The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + mlir::PatternMatchResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = + llvm::dyn_cast_or_null(transposeInput->getDefiningOp()); + + // If the input is defined by another Transpose, bingo! + if (!transposeInputOp) + return matchFailure(); + + // Use the rewriter to perform the replacement. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp}); + return matchSuccess(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} diff --git a/mlir/examples/toy/Ch6/mlir/ToyCombine.td b/mlir/examples/toy/Ch6/mlir/ToyCombine.td new file mode 100644 index 0000000000000000000000000000000000000000..e6e33e84d7e8f3e13aea9840f3690029de025d94 --- /dev/null +++ b/mlir/examples/toy/Ch6/mlir/ToyCombine.td @@ -0,0 +1,62 @@ +//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines language-specific pattern match optimizations for Toy using +// Declarative Rewrite Rules (DRR) specified using TableGen records. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_COMBINE +#define TOY_COMBINE + +include "toy/Ops.td" + +/// Note: The DRR definition used for defining patterns is shown below: +/// +/// class Pattern< +/// dag sourcePattern, list resultPatterns, +/// list additionalConstraints = [], +/// dag benefitsAdded = (addBenefit 0) +/// >; + +//===----------------------------------------------------------------------===// +// Basic Pattern-Match and Rewrite +//===----------------------------------------------------------------------===// + +// Reshape(Reshape(x)) = Reshape(x) +def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)), + (ReshapeOp $arg)>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite using Native Code Call +//===----------------------------------------------------------------------===// + +// Native Code Calls may be used for more complex transformations using inline +// C++ and C++ helper functions. + +// Reshape(Constant(x)) = x' +def ReshapeConstant : + NativeCodeCall<"$0.reshape(($1->getType()).cast())">; +def FoldConstantReshapeOptPattern : Pat< + (ReshapeOp:$res (ConstantOp $arg)), + (ConstantOp (ReshapeConstant $arg, $res))>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite with Constraints +//===----------------------------------------------------------------------===// + +// DRR allows for constraint checking when the transformation is conditional +// on operand properties. 
+ +// Reshape(x) = x, where input and output shapes are identical +def TypesAreIdentical : ConstraintgetType() == $1->getType()">>; +def RedundantReshapeOptPattern : Pat< + (ReshapeOp:$res $arg), (replaceWithValue $arg), + [(TypesAreIdentical $res, $arg)]>; + +#endif // TOY_COMBINE diff --git a/mlir/examples/toy/Ch6/parser/AST.cpp b/mlir/examples/toy/Ch6/parser/AST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d6d9359529bffc068520bebf4a9ea56f436a415 --- /dev/null +++ b/mlir/examples/toy/Ch6/parser/AST.cpp @@ -0,0 +1,234 @@ +//===- AST.cpp - Helper for printing out the Toy AST ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST dump for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "mlir/ADT/TypeSwitch.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. 
+class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case( + [&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. 
+void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. +void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. 
+void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. +void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. 
+void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e5b2afb7c65c5b6fcf9b10817c320b9bfdc11b2 --- /dev/null +++ b/mlir/examples/toy/Ch6/toyc.cpp @@ -0,0 +1,274 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} 
+static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the input file as an MLIR file"))); + +namespace { +enum Action { + None, + DumpAST, + DumpMLIR, + DumpMLIRAffine, + DumpMLIRLLVM, + DumpLLVMIR, + RunJIT +}; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump")), + cl::values(clEnumValN(DumpMLIRAffine, "mlir-affine", + "output the MLIR dump after affine lowering")), + cl::values(clEnumValN(DumpMLIRLLVM, "mlir-llvm", + "output the MLIR dump after llvm lowering")), + cl::values(clEnumValN(DumpLLVMIR, "llvm", "output the LLVM IR dump")), + cl::values( + clEnumValN(RunJIT, "jit", + "JIT the code and run it by invoking the main function"))); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int loadMLIR(mlir::MLIRContext &context, mlir::OwningModuleRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. 
+ llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. + llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +int loadAndProcessMLIR(mlir::MLIRContext &context, + mlir::OwningModuleRef &module) { + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(&context); + // Apply any generic pass manager command line options and run the pipeline. + applyPassManagerCLOptions(pm); + + // Check to see what granularity of MLIR we are compiling to. + bool isLoweringToAffine = emitAction >= Action::DumpMLIRAffine; + bool isLoweringToLLVM = emitAction >= Action::DumpMLIRLLVM; + + if (enableOpt || isLoweringToAffine) { + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::toy::createDeadFunctionEliminationPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + } + + if (isLoweringToAffine) { + // Partially lower the toy dialect with a few cleanups afterwards. + pm.addPass(mlir::toy::createLowerToAffinePass()); + + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Add optimizations if enabled. 
+ if (enableOpt) { + optPM.addPass(mlir::createLoopFusionPass()); + optPM.addPass(mlir::createMemRefDataFlowOptPass()); + } + } + + if (isLoweringToLLVM) { + // Finish lowering the toy IR to the LLVM dialect. + pm.addPass(mlir::toy::createLowerToLLVMPass()); + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int dumpLLVMIR(mlir::ModuleOp module) { + auto llvmModule = mlir::translateModuleToLLVMIR(module); + if (!llvmModule) { + llvm::errs() << "Failed to emit LLVM IR\n"; + return -1; + } + + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + mlir::ExecutionEngine::setupTargetTriple(llvmModule.get()); + + /// Optionally run an optimization pipeline over the llvm module. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + if (auto err = optPipeline(llvmModule.get())) { + llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; + return -1; + } + llvm::errs() << *llvmModule << "\n"; + return 0; +} + +int runJit(mlir::ModuleOp module) { + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // An optimization pipeline to use within the execution engine. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + + // Create an MLIR execution engine. The execution engine eagerly JIT-compiles + // the module. 
+ auto maybeEngine = mlir::ExecutionEngine::create(module, optPipeline); + assert(maybeEngine && "failed to construct an execution engine"); + auto &engine = maybeEngine.get(); + + // Invoke the JIT-compiled function. + auto invocationResult = engine->invoke("main"); + if (invocationResult) { + llvm::errs() << "JIT invocation failed\n"; + return -1; + } + + return 0; +} + +int main(int argc, char **argv) { + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + if (emitAction == Action::DumpAST) + return dumpAST(); + + // If we aren't dumping the AST, then we are compiling with/to MLIR. + + // Register our Dialect with MLIR. + mlir::registerDialect(); + + mlir::MLIRContext context; + mlir::OwningModuleRef module; + if (int error = loadAndProcessMLIR(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. + bool isOutputingMLIR = emitAction <= Action::DumpMLIRLLVM; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + // Check to see if we are compiling to LLVM IR. + if (emitAction == Action::DumpLLVMIR) + return dumpLLVMIR(*module); + + // Otherwise, we must be running the jit. 
+ if (emitAction == Action::RunJIT) + return runJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; +} diff --git a/mlir/examples/toy/Ch7/CMakeLists.txt b/mlir/examples/toy/Ch7/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5956d7f4d9b7193371cad01b56e9fde00ecef716 --- /dev/null +++ b/mlir/examples/toy/Ch7/CMakeLists.txt @@ -0,0 +1,53 @@ +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Core + Support + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters "-I${CMAKE_CURRENT_SOURCE_DIR}/include") +add_public_tablegen_target(ToyCh7CombineIncGen) + +add_toy_chapter(toyc-ch7 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/DeadFunctionEliminationPass.cpp + mlir/LowerToAffineLoops.cpp + mlir/LowerToLLVM.cpp + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp + ) + +add_dependencies(toyc-ch7 ToyCh7ShapeInferenceInterfaceIncGen) +add_dependencies(toyc-ch7 ToyCh7OpsIncGen) +add_dependencies(toyc-ch7 ToyCh7CombineIncGen) +add_dependencies(toyc-ch7 MLIRCallOpInterfacesIncGen) +include_directories(include/) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +target_link_libraries(toyc-ch7 + PRIVATE + MLIRAffineOps + MLIRAffineToStandard + MLIRAnalysis + MLIRExecutionEngine + MLIRIR + MLIRLLVMIR + MLIRLoopToStandard + MLIRParser + MLIRPass + MLIRStandardOps + MLIRStandardToLLVM + MLIRTargetLLVMIR + MLIRTransforms + ) + +whole_archive_link(toyc-ch7 + MLIRAffineToStandard + MLIRAffineOps + MLIRLLVMIR + MLIRStandardOps + ) diff --git a/mlir/examples/toy/Ch7/include/CMakeLists.txt b/mlir/examples/toy/Ch7/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c89d0bae965cfc8665515de7e60ad7867a7d8b --- /dev/null +++ b/mlir/examples/toy/Ch7/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git 
a/mlir/examples/toy/Ch7/include/toy/AST.h b/mlir/examples/toy/Ch7/include/toy/AST.h new file mode 100644 index 0000000000000000000000000000000000000000..3d3ae89dbeb2d03adaaa98c40d112d1e1a1285ab --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/AST.h @@ -0,0 +1,308 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_AST_H_ +#define MLIR_TUTORIAL_TOY_AST_H_ + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include + +namespace toy { + +/// A variable type with either name or shape information. +struct VarType { + std::string name; + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_StructLiteral, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(location) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". 
+class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(Location loc, double val) : ExprAST(Expr_Num, loc), Val(val) {} + + double getValue() { return Val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. +class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, loc), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for a literal struct value. +class StructLiteralExprAST : public ExprAST { + std::vector> values; + +public: + StructLiteralExprAST(Location loc, + std::vector> values) + : ExprAST(Expr_StructLiteral, loc), values(std::move(values)) {} + + llvm::ArrayRef> getValues() { return values; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { + return c->getKind() == Expr_StructLiteral; + } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, loc), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. 
+class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal = nullptr) + : ExprAST(Expr_VarDecl, loc), name(name), type(std::move(type)), + initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + llvm::Optional> expr; + +public: + ReturnExprAST(Location loc, llvm::Optional> expr) + : ExprAST(Expr_Return, loc), expr(std::move(expr)) {} + + llvm::Optional getExpr() { + if (expr.hasValue()) + return expr->get(); + return llvm::None; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char Op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, loc), op(Op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. 
+class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, loc), callee(callee), args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, loc), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). +class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(location), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a top level record in a module. +class RecordAST { +public: + enum RecordASTKind { + Record_Function, + Record_Struct, + }; + + RecordAST(RecordASTKind kind) : kind(kind) {} + virtual ~RecordAST() = default; + + RecordASTKind getKind() const { return kind; } + +private: + const RecordASTKind kind; +}; + +/// This class represents a function definition itself. 
+class FunctionAST : public RecordAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : RecordAST(Record_Function), proto(std::move(proto)), + body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } + + /// LLVM style RTTI + static bool classof(const RecordAST *R) { + return R->getKind() == Record_Function; + } +}; + +/// This class represents a struct definition. +class StructAST : public RecordAST { + Location location; + std::string name; + std::vector> variables; + +public: + StructAST(Location location, const std::string &name, + std::vector> variables) + : RecordAST(Record_Struct), location(location), name(name), + variables(std::move(variables)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getVariables() { + return variables; + } + + /// LLVM style RTTI + static bool classof(const RecordAST *R) { + return R->getKind() == Record_Struct; + } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector> records; + +public: + ModuleAST(std::vector> records) + : records(std::move(records)) {} + + auto begin() -> decltype(records.begin()) { return records.begin(); } + auto end() -> decltype(records.end()) { return records.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_AST_H_ diff --git a/mlir/examples/toy/Ch7/include/toy/CMakeLists.txt b/mlir/examples/toy/Ch7/include/toy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa30bd2e8e03eae897f5b7110703bb811125662e --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls "-I${CMAKE_CURRENT_SOURCE_DIR}/..") +mlir_tablegen(Ops.cpp.inc -gen-op-defs 
"-I${CMAKE_CURRENT_SOURCE_DIR}/..") +add_public_tablegen_target(ToyCh7OpsIncGen) + +set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCh7ShapeInferenceInterfaceIncGen) diff --git a/mlir/examples/toy/Ch7/include/toy/Dialect.h b/mlir/examples/toy/Ch7/include/toy/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..77481b1884fab8ce35fa3864d3c0fcb0303ffc51 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/Dialect.h @@ -0,0 +1,100 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. +// See g3doc/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/StandardTypes.h" +#include "toy/ShapeInferenceInterface.h" + +namespace mlir { +namespace toy { +namespace detail { +struct StructTypeStorage; +} // end namespace detail + +/// This is the definition of the Toy dialect. A dialect inherits from +/// mlir::Dialect and registers custom attributes, operations, and types (in its +/// constructor). It can also override some general behavior exposed via virtual +/// methods. +class ToyDialect : public mlir::Dialect { +public: + explicit ToyDialect(mlir::MLIRContext *ctx); + + /// A hook used to materialize constant values with the given type. 
+ Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; + + /// Parse an instance of a type registered to the toy dialect. + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + + /// Print an instance of a type registered to the toy dialect. + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; + + /// Provide a utility accessor to the dialect namespace. This is used by + /// several utilities for casting between dialects. + static llvm::StringRef getDialectNamespace() { return "toy"; } +}; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. +#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +//===----------------------------------------------------------------------===// +// Toy Types +//===----------------------------------------------------------------------===// + +/// Create a local enumeration with all of the types that are defined by Toy. +namespace ToyTypes { +enum Types { + Struct = mlir::Type::FIRST_TOY_TYPE, +}; +} // end namespace ToyTypes + +/// This class defines the Toy struct type. It represents a collection of +/// element types. All derived types in MLIR must inherit from the CRTP class +/// 'Type::TypeBase'. It takes as template parameters the concrete type +/// (StructType), the base class to use (Type), and the storage class +/// (StructTypeStorage). +class StructType : public mlir::Type::TypeBase { +public: + /// Inherit some necessary constructors from 'TypeBase'. + using Base::Base; + + /// This static method is used to support type inquiry through isa, cast, + /// and dyn_cast. 
+ static bool kindof(unsigned kind) { return kind == ToyTypes::Struct; } + + /// Create an instance of a `StructType` with the given element types. There + /// *must* be atleast one element type. + static StructType get(llvm::ArrayRef elementTypes); + + /// Returns the element types of this struct type. + llvm::ArrayRef getElementTypes(); + + /// Returns the number of element type held by this struct. + size_t getNumElementTypes() { return getElementTypes().size(); } +}; +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/examples/toy/Ch7/include/toy/Lexer.h b/mlir/examples/toy/Ch7/include/toy/Lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..b41b82f2a0a7064351b99ce6575bddce9b5dc96e --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/Lexer.h @@ -0,0 +1,235 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_LEXER_H_ +#define MLIR_TUTORIAL_TOY_LEXER_H_ + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. 
+enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + tok_struct = -5, + + // primary + tok_identifier = -6, + tok_number = -7, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. +class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. 
+ int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. + if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. + lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "struct") + return tok_struct; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9] ([0-9.])* + if (isdigit(lastChar)) { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. 
+ do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. + std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. + Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. 
+ llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_LEXER_H_ diff --git a/mlir/examples/toy/Ch7/include/toy/MLIRGen.h b/mlir/examples/toy/Ch7/include/toy/MLIRGen.h new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca1201d1a2a391c0aec0d89197fbbb18efb8 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/MLIRGen.h @@ -0,0 +1,32 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_MLIRGEN_H_ +#define MLIR_TUTORIAL_TOY_MLIRGEN_H_ + +#include + +namespace mlir { +class MLIRContext; +class OwningModuleRef; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. 
+mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, ModuleAST &moduleAST); +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_MLIRGEN_H_ diff --git a/mlir/examples/toy/Ch7/include/toy/Ops.td b/mlir/examples/toy/Ch7/include/toy/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..15395c6da4e61058afd36da4a7c860593a8f4ca1 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/Ops.td @@ -0,0 +1,300 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Analysis/CallInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. +def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. +class Toy_Op traits = []> : + Op; + +// Provide a definition for the Toy StructType for use in ODS. This allows for +// using StructType in a similar way to Tensor or MemRef. +def Toy_StructType : + Type()">, "Toy struct type">; + +// Provide a definition of the types that are used within the Toy dialect. 
+def Toy_Type : AnyTypeOf<[F64Tensor, Toy_StructType]>; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'NoSideEffect' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", + [NoSideEffect, DeclareOpInterfaceMethods]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = "toy.constant"() + { value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64> } + : () -> tensor<2x3xf64> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F64ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F64Tensor); + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. these are used when + // using `builder.create(...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<"Builder *builder, OperationState &state, " + "DenseElementsAttr value", [{ + build(builder, state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<"Builder *builder, OperationState &state, double value"> + ]; + + // Invoke a static verify method to verify this constant operation. 
+ let verifier = [{ return ::verify(*this); }]; + + // Set the folder bit so that we can implement constant folders. + let hasFolder = 1; +} + +def AddOp : Toy_Op<"add", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def CastOp : Toy_Op<"cast", + [DeclareOpInterfaceMethods, NoSideEffect, + SameOperandsAndResultShape]> { + let summary = "shape cast operation"; + let description = [{ + The "cast" operation converts a tensor from one type to an equivalent type + without changing any data elements. The source and destination types + must both be tensor types with the same element type. If both are ranked + then the rank should be the same and static dimensions should match. The + operation is invalid if converting to a mismatching constant dimension. + }]; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor:$output); + + // Set the folder bit so that we can fold redundant cast operations. + let hasFolder = 1; +} + +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. 
For example: + + ```mlir + %4 = "toy.generic_call"(%1, %3) {callee = @my_func} + : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. + }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$inputs); + + // The generic call operation returns a single value of TensorType or + // StructType. + let results = (outs Toy_Type); + + // Add custom build methods for the generic call operation. + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "StringRef callee, ArrayRef arguments"> + ]; +} + +def MulOp : Toy_Op<"mul", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); + let results = (outs F64Tensor); + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value lhs, Value rhs"> + ]; +} + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + // We also allow a F64MemRef to enable interop during partial lowering. + let arguments = (ins AnyTypeOf<[F64Tensor, F64MemRef]>:$input); +} + +def ReshapeOp : Toy_Op<"reshape", [NoSideEffect]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. 
For example: + + ```mlir + %0 = "toy.reshape"(%arg1) : (tensor<10xf64>) -> tensor<5x2xf64> + ``` + }]; + + let arguments = (ins F64Tensor:$input); + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. + let results = (outs StaticShapeTensorOf<[F64]>); +} + +def ReturnOp : Toy_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + func @foo() -> tensor<2xf64> { + ... + toy.return %0 : tensor<2xf64> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // Allow building a ReturnOp with no return operand. + let builders = [OpBuilder< + "Builder *b, OperationState &state", [{ build(b, state, llvm::None); }] + >]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Invoke a static verify method to verify this return operation. + let verifier = [{ return ::verify(*this); }]; +} + +def StructAccessOp : Toy_Op<"struct_access", [NoSideEffect]> { + let summary = "struct access"; + let description = [{ + Access the Nth element of a value returning a struct type. + }]; + + let arguments = (ins Toy_StructType:$input, I64Attr:$index); + let results = (outs Toy_Type); + + // Allow building a StructAccessOp with just a struct value and an index. 
+ let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input, size_t index"> + ]; + + let verifier = [{ return ::verify(*this); }]; + + // Set the folder bit so that we can fold constant accesses. + let hasFolder = 1; +} + +def StructConstantOp : Toy_Op<"struct_constant", [NoSideEffect]> { + let summary = "struct constant"; + let description = [{ + Constant operation turns a literal struct value into an SSA value. The data + is attached to the operation as an attribute. The struct constant is encoded + as an array of other constant values. For example: + + ```mlir + %0 = "toy.struct_constant"() { + value = [dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf64>] + } : () -> !toy.struct> + ``` + }]; + + let hasFolder = 1; + let arguments = (ins ArrayAttr:$value); + let results = (outs Toy_StructType); + let verifier = [{ return ::verify(*this); }]; +} + +def TransposeOp : Toy_Op<"transpose", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "transpose operation"; + + let arguments = (ins F64Tensor:$input); + let results = (outs F64Tensor); + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<"Builder *b, OperationState &state, Value input"> + ]; + + // Invoke a static verify method to verify this transpose operation. + let verifier = [{ return ::verify(*this); }]; +} + +#endif // TOY_OPS diff --git a/mlir/examples/toy/Ch7/include/toy/Parser.h b/mlir/examples/toy/Ch7/include/toy/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..d2659e04dacb028c092264bdd8db91acf91ac518 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/Parser.h @@ -0,0 +1,678 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PARSER_H +#define MLIR_TUTORIAL_TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions and structs one at a time and accumulate in this vector. 
+ std::vector> records; + while (true) { + std::unique_ptr record; + switch (lexer.getCurToken()) { + case tok_eof: + break; + case tok_def: + record = parseDefinition(); + break; + case tok_struct: + record = parseStruct(); + break; + default: + return parseError("'def' or 'struct'", + "when parsing top level module records"); + } + if (!record) + break; + records.push_back(std::move(record)); + } + + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(records)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + llvm::Optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. + /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. 
+ } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. + if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// Parse a literal struct expression. + /// structLiteral ::= { (structLiteral | tensorLiteral)+ } + std::unique_ptr parseStructLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('{')); + + // Hold the list of values. + std::vector> values; + do { + // We can have either another nested array or a number literal. 
+ if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; + } else if (lexer.getCurToken() == tok_number) { + values.push_back(parseNumberExpr()); + if (!values.back()) + return nullptr; + } else { + if (lexer.getCurToken() != '{') + return parseError("{, [, or number", + "in struct literal expression"); + values.push_back(parseStructLiteralExpr()); + } + + // End of this list on '}' + if (lexer.getCurToken() == '}') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("} or ,", "in struct literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", + "to fill struct literal expression"); + lexer.getNextToken(); // eat } + + return std::make_unique(std::move(loc), + std::move(values)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// Parse a call expression. 
+ std::unique_ptr parseCallExpr(llvm::StringRef name, + const Location &loc) { + lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name = lexer.getId(); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. + return parseCallExpr(name, loc); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case '{': + return parseStructLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. 
+ /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. + auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse either a variable declaration or a call expression. 
+ std::unique_ptr parseDeclarationOrCallExpr() { + auto loc = lexer.getLastLocation(); + std::string id = lexer.getId(); + lexer.consume(tok_identifier); + + // Check for a call expression. + if (lexer.getCurToken() == '(') + return parseCallExpr(id, loc); + + // Otherwise, this is a variable declaration. + return parseTypedDeclaration(id, /*requiresInitializer=*/true, loc); + } + + /// Parse a typed variable declaration. + std::unique_ptr + parseTypedDeclaration(llvm::StringRef typeName, bool requiresInitializer, + const Location &loc) { + // Parse the variable name. + if (lexer.getCurToken() != tok_identifier) + return parseError("name", "in variable declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + // Parse the initializer. + std::unique_ptr expr; + if (requiresInitializer) { + if (lexer.getCurToken() != '=') + return parseError("initializer", + "in variable declaration"); + lexer.consume(Token('=')); + expr = parseExpression(); + } + + VarType type; + type.name = typeName; + return std::make_unique(loc, std::move(id), std::move(type), + std::move(expr)); + } + + /// Parse a variable declaration, for either a tensor value or a struct value, + /// with an optionally required initializer. + /// decl ::= var identifier [ type ] (= expr)? + /// decl ::= identifier identifier (= expr)? + std::unique_ptr parseDeclaration(bool requiresInitializer) { + // Check to see if this is a 'var' declaration. + if (lexer.getCurToken() == tok_var) + return parseVarDeclaration(requiresInitializer); + + // Parse the type name. + if (lexer.getCurToken() != tok_identifier) + return parseError("type name", "in variable declaration"); + auto loc = lexer.getLastLocation(); + std::string typeName = lexer.getId(); + lexer.getNextToken(); // eat id + + // Parse the rest of the declaration. 
+ return parseTypedDeclaration(typeName, requiresInitializer, loc); + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// optionally required initializer. + /// decl ::= var identifier [ type ] (= expr)? + std::unique_ptr + parseVarDeclaration(bool requiresInitializer) { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id = lexer.getId(); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + if (!type) + type = std::make_unique(); + + std::unique_ptr expr; + if (requiresInitializer) { + lexer.consume(Token('=')); + expr = parseExpression(); + } + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. 
+ while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_identifier) { + // Variable declaration or call + auto expr = parseDeclarationOrCallExpr(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } else if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(/*requiresInitializer=*/true); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. + if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_def); + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName = lexer.getId(); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + VarType type; + std::string name; + + // Parse either the name of the variable, or its type. 
+ std::string nameOrType = lexer.getId(); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + + // If the next token is an identifier, we just parsed the type. + if (lexer.getCurToken() == tok_identifier) { + type.name = std::move(nameOrType); + + // Parse the name. + name = lexer.getId(); + lexer.consume(tok_identifier); + } else { + // Otherwise, we just parsed the name. + name = std::move(nameOrType); + } + + args.push_back( + std::make_unique(std::move(loc), name, type)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError("}", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. + /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Parse a struct definition, we expect a struct initiated with the + /// `struct` keyword, followed by a block containing a list of variable + /// declarations. 
+ /// + /// definition ::= `struct` identifier `{` decl+ `}` + std::unique_ptr parseStruct() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_struct); + if (lexer.getCurToken() != tok_identifier) + return parseError("name", "in struct definition"); + std::string name = lexer.getId(); + lexer.consume(tok_identifier); + + // Parse: '{' + if (lexer.getCurToken() != '{') + return parseError("{", "in struct definition"); + lexer.consume(Token('{')); + + // Parse: decl+ + std::vector> decls; + do { + auto decl = parseDeclaration(/*requiresInitializer=*/false); + if (!decl) + return nullptr; + decls.push_back(std::move(decl)); + + if (lexer.getCurToken() != ';') + return parseError(";", + "after variable in struct definition"); + lexer.consume(Token(';')); + } while (lexer.getCurToken() != '}'); + + // Parse: '}' + lexer.consume(Token('}')); + return std::make_unique(loc, name, std::move(decls)); + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + case '.': + return 60; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. 
+ template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // MLIR_TUTORIAL_TOY_PARSER_H diff --git a/mlir/examples/toy/Ch7/include/toy/Passes.h b/mlir/examples/toy/Ch7/include/toy/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..33c2021c8db298671d41987e10de508507065f15 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/Passes.h @@ -0,0 +1,36 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_PASSES_H +#define MLIR_TUTORIAL_TOY_PASSES_H + +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createDeadFunctionEliminationPass(); +std::unique_ptr createShapeInferencePass(); + +/// Create a pass for lowering to operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr createLowerToAffinePass(); + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. 
+std::unique_ptr createLowerToLLVMPass(); + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_PASSES_H diff --git a/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.h b/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..da0fb66018ee4df1882d26f074ecd49a24ddcea9 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. +#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // end namespace toy +} // end namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.td b/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..1b38ada1622862057ad2c18eabe147b875e18cf2 --- /dev/null +++ b/mlir/examples/toy/Ch7/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. + }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/examples/toy/Ch7/mlir/DeadFunctionEliminationPass.cpp b/mlir/examples/toy/Ch7/mlir/DeadFunctionEliminationPass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee34547860cd98c27c21da874ad794a6d0c99d5 --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/DeadFunctionEliminationPass.cpp @@ -0,0 +1,59 @@ +//===- DeadFunctionEliminationPass.cpp - Eliminate inlined functions ------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Module level pass performing dead function +// elimination. This is required as a post-processing step after function +// inlining. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/Passes.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace { +/// This is a simple function DCE pass that deletes all non-main functions after +/// inlining. +/// TODO(riverriddle) This is only necessary because MLIR currently does not +/// have generic DCE support for functions. +class DeadFunctionEliminationPass + : public mlir::ModulePass { +public: + void runOnModule() override { + mlir::ModuleOp module = getModule(); + mlir::SymbolTable moduleSymTable(module); + + // Eliminate non-main functions. + auto mainFn = moduleSymTable.lookup("main"); + for (mlir::FuncOp func : + llvm::make_early_inc_range(module.getOps())) { + if (func != mainFn) + func.erase(); + } + } +}; +} // end anonymous namespace + +/// Create a pass that eliminates inlined functions in toy. +std::unique_ptr mlir::toy::createDeadFunctionEliminationPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e37f61a4739d51a1bd9e806a5c01d6f88ffd3c5 --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp @@ -0,0 +1,474 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/InliningUtils.h" + +using namespace mlir; +using namespace mlir::toy; + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. +struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, + BlockAndValueMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. 
+ assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()]->replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. + Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect creation, the instance will be owned by the context. This is the +/// point of registration of custom types and operations for the dialect. +ToyDialect::ToyDialect(mlir::MLIRContext *ctx) : mlir::Dialect("toy", ctx) { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); + addTypes(); +} + +mlir::Operation *ToyDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + if (type.isa()) + return builder.create(loc, type, + value.cast()); + return builder.create(loc, type, + value.cast()); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. 
+void ConstantOp::build(mlir::Builder *builder, mlir::OperationState &state, + double value) { + auto dataType = RankedTensorType::get({}, builder->getF64Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// Verify that the given attribute value is valid for the given type. +static mlir::LogicalResult verifyConstantForType(mlir::Type type, + mlir::Attribute opaqueValue, + mlir::Operation *op) { + if (type.isa()) { + // Check that the value is a elements attribute. + auto attrValue = opaqueValue.dyn_cast(); + if (!attrValue) + return op->emitError("constant of TensorType must be initialized by " + "a DenseFPElementsAttr, got ") + << opaqueValue; + + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = type.dyn_cast(); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the + // constant result type. + auto attrType = attrValue.getType().cast(); + if (attrType.getRank() != resultType.getRank()) { + return op->emitOpError("return type must match the one of the attached " + "value attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. + for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return op->emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); + } + auto resultType = type.cast(); + llvm::ArrayRef resultElementTypes = resultType.getElementTypes(); + + // Verify that the initializer is an Array. 
+ auto attrValue = opaqueValue.dyn_cast(); + if (!attrValue || attrValue.getValue().size() != resultElementTypes.size()) + return op->emitError("constant of StructType must be initialized by an " + "ArrayAttr with the same number of elements, got ") + << opaqueValue; + + // Check that each of the elements are valid. + llvm::ArrayRef attrElementValues = attrValue.getValue(); + for (const auto &it : llvm::zip(resultElementTypes, attrElementValues)) + if (failed(verifyConstantForType(std::get<0>(it), std::get<1>(it), op))) + return mlir::failure(); + return mlir::success(); +} + +/// Verifier for the constant operation. This corresponds to the `::verify(...)` +/// in the op definition. +static mlir::LogicalResult verify(ConstantOp op) { + return verifyConstantForType(op.getResult()->getType(), op.value(), op); +} + +static mlir::LogicalResult verify(StructConstantOp op) { + return verifyConstantForType(op.getResult()->getType(), op.value(), op); +} + +/// Infer the output shape of the ConstantOp, this is required by the shape +/// inference interface. +void ConstantOp::inferShapes() { getResult()->setType(value().getType()); } + +//===----------------------------------------------------------------------===// +// AddOp + +void AddOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// CastOp + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. 
+void CastOp::inferShapes() { getResult()->setType(getOperand()->getType()); } + +//===----------------------------------------------------------------------===// +// GenericCallOp + +void GenericCallOp::build(mlir::Builder *builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(arguments); + state.addAttribute("callee", builder->getSymbolRefAttr(callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return getAttrOfType("callee"); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range GenericCallOp::getArgOperands() { return inputs(); } + +//===----------------------------------------------------------------------===// +// MulOp + +void MulOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp + +static mlir::LogicalResult verify(ReturnOp op) { + // We know that the parent operation is a function, because of the 'HasParent' + // trait attached to the operation definition. + auto function = cast(op.getParentOp()); + + /// ReturnOps can only have a single optional operand. + if (op.getNumOperands() > 1) + return op.emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. 
+ const auto &results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError() + << "does not return the same number of values (" + << op.getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. + if (!op.hasOperand()) + return mlir::success(); + + auto inputType = *op.operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. + if (inputType == resultType || inputType.isa() || + resultType.isa()) + return mlir::success(); + + return op.emitError() << "type of return operand (" + << *op.operand_type_begin() + << ") doesn't match function result type (" + << results.front() << ")"; +} + +//===----------------------------------------------------------------------===// +// StructAccessOp + +void StructAccessOp::build(mlir::Builder *b, mlir::OperationState &state, + mlir::Value input, size_t index) { + // Extract the result type from the input type. + StructType structTy = input->getType().cast(); + assert(index < structTy.getNumElementTypes()); + mlir::Type resultType = structTy.getElementTypes()[index]; + + // Call into the auto-generated build method. 
+ build(b, state, resultType, input, b->getI64IntegerAttr(index)); +} + +static mlir::LogicalResult verify(StructAccessOp op) { + StructType structTy = op.input()->getType().cast(); + size_t index = op.index().getZExtValue(); + if (index >= structTy.getNumElementTypes()) + return op.emitOpError() + << "index should be within the range of the input struct type"; + mlir::Type resultType = op.getResult()->getType(); + if (resultType != structTy.getElementTypes()[index]) + return op.emitOpError() << "must have the same result type as the struct " + "element referred to by the index"; + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// TransposeOp + +void TransposeOp::build(mlir::Builder *builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder->getF64Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = getOperand()->getType().cast(); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult()->setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +static mlir::LogicalResult verify(TransposeOp op) { + auto inputType = op.getOperand()->getType().dyn_cast(); + auto resultType = op.getType().dyn_cast(); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return op.emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// Toy Types +//===----------------------------------------------------------------------===// + +namespace mlir { +namespace toy { +namespace detail { +/// This class represents the internal storage of the Toy `StructType`. 
+struct StructTypeStorage : public mlir::TypeStorage { + /// The `KeyTy` is a required type that provides an interface for the storage + /// instance. This type will be used when uniquing an instance of the type + /// storage. For our struct type, we will unique each instance structurally on + /// the elements that it contains. + using KeyTy = llvm::ArrayRef; + + /// A constructor for the type storage instance. + StructTypeStorage(llvm::ArrayRef elementTypes) + : elementTypes(elementTypes) {} + + /// Define the comparison function for the key type with the current storage + /// instance. This is used when constructing a new instance to ensure that we + /// haven't already uniqued an instance of the given key. + bool operator==(const KeyTy &key) const { return key == elementTypes; } + + /// Define a hash function for the key type. This is used when uniquing + /// instances of the storage, see the `StructType::get` method. + /// Note: This method isn't necessary as both llvm::ArrayRef and mlir::Type + /// have hash functions available, so we could just omit this entirely. + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + /// Define a construction function for the key type from a set of parameters. + /// These parameters will be provided when constructing the storage instance + /// itself. + /// Note: This method isn't necessary because KeyTy can be directly + /// constructed with the given parameters. + static KeyTy getKey(llvm::ArrayRef elementTypes) { + return KeyTy(elementTypes); + } + + /// Define a construction method for creating a new instance of this storage. + /// This method takes an instance of a storage allocator, and an instance of a + /// `KeyTy`. The given allocator must be used for *all* necessary dynamic + /// allocations used to create the type storage and its internal. 
+ static StructTypeStorage *construct(mlir::TypeStorageAllocator &allocator, + const KeyTy &key) { + // Copy the elements from the provided `KeyTy` into the allocator. + llvm::ArrayRef elementTypes = allocator.copyInto(key); + + // Allocate the storage instance and construct it. + return new (allocator.allocate()) + StructTypeStorage(elementTypes); + } + + /// The following field contains the element types of the struct. + llvm::ArrayRef elementTypes; +}; +} // end namespace detail +} // end namespace toy +} // end namespace mlir + +/// Create an instance of a `StructType` with the given element types. There +/// *must* be at least one element type. +StructType StructType::get(llvm::ArrayRef elementTypes) { + assert(!elementTypes.empty() && "expected at least 1 element type"); + + // Call into a helper 'get' method in 'TypeBase' to get a uniqued instance + // of this type. The first two parameters are the context to unique in and the + // kind of the type. The parameters after the type kind are forwarded to the + // storage instance. + mlir::MLIRContext *ctx = elementTypes.front().getContext(); + return Base::get(ctx, ToyTypes::Struct, elementTypes); +} + +/// Returns the element types of this struct type. +llvm::ArrayRef StructType::getElementTypes() { + // 'getImpl' returns a pointer to the internal storage instance. + return getImpl()->elementTypes; +} + +/// Parse an instance of a type registered to the toy dialect. +mlir::Type ToyDialect::parseType(mlir::DialectAsmParser &parser) const { + // Parse a struct type in the following form: + // struct-type ::= `struct` `<` type (`,` type)* `>` + + // NOTE: All MLIR parser function return a ParseResult. This is a + // specialization of LogicalResult that auto-converts to a `true` boolean + // value on failure to allow for chaining, but may be used with explicit + // `mlir::failed/mlir::succeeded` as desired. 
+ + // Parse: `struct` `<` + if (parser.parseKeyword("struct") || parser.parseLess()) + return Type(); + + // Parse the element types of the struct. + SmallVector elementTypes; + do { + // Parse the current element type. + llvm::SMLoc typeLoc = parser.getCurrentLocation(); + mlir::Type elementType; + if (parser.parseType(elementType)) + return nullptr; + + // Check that the type is either a TensorType or another StructType. + if (!elementType.isa() && + !elementType.isa()) { + parser.emitError(typeLoc, "element type for a struct must either " + "be a TensorType or a StructType, got: ") + << elementType; + return Type(); + } + elementTypes.push_back(elementType); + + // Parse the optional: `,` + } while (succeeded(parser.parseOptionalComma())); + + // Parse: `>` + if (parser.parseGreater()) + return Type(); + return StructType::get(elementTypes); +} + +/// Print an instance of a type registered to the toy dialect. +void ToyDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // Currently the only toy type is a struct type. + StructType structType = type.cast(); + + // Print the struct type according to the parser format. 
+ printer << "struct<"; + mlir::interleaveComma(structType.getElementTypes(), printer); + printer << '>'; +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d6e76de069ce235033287496a0ed556789fcf4a --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp @@ -0,0 +1,309 @@ +//====- LowerToAffineLoops.cpp - Partial lowering from Toy to Affine+Std --===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops and standard operations. This lowering expects that all calls +// have been inlined, and all shapes have been resolved. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/Sequence.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns +//===----------------------------------------------------------------------===// + +/// Convert the given TensorType into the corresponding MemRefType. 
+static MemRefType convertTensorToMemRef(TensorType type) { + assert(type.hasRank() && "expected only ranked shapes"); + return MemRefType::get(type.getShape(), type.getElementType()); +} + +/// Insert an allocation and deallocation for the given MemRefType. +static Value insertAllocAndDealloc(MemRefType type, Location loc, + PatternRewriter &rewriter) { + auto alloc = rewriter.create(loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc.getOperation()->getBlock(); + alloc.getOperation()->moveBefore(&parentBlock->front()); + + // Make sure to deallocate this alloc at the end of the block. This is fine + // as toy functions have no control flow. + auto dealloc = rewriter.create(loc, alloc); + dealloc.getOperation()->moveBefore(&parentBlock->back()); + return alloc; +} + +/// This defines the function type used to process an iteration of a lowered +/// loop. It takes as input a rewriter, an array of memRefOperands corresponding +/// to the operands of the input operation, and the set of loop induction +/// variables for the iteration. It returns a value to store at the current +/// index of the iteration. +using LoopIterationFn = function_ref memRefOperands, + ArrayRef loopIvs)>; + +static void lowerOpToLoops(Operation *op, ArrayRef operands, + PatternRewriter &rewriter, + LoopIterationFn processIteration) { + auto tensorType = (*op->result_type_begin()).cast(); + auto loc = op->getLoc(); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // Create an empty affine loop for each of the dimensions within the shape. 
+ SmallVector loopIvs; + for (auto dim : tensorType.getShape()) { + auto loop = rewriter.create(loc, /*lb=*/0, dim, /*step=*/1); + loop.getBody()->clear(); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body and update the rewriter insertion point to the + // beginning of the loop. + rewriter.setInsertionPointToStart(loop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to the processing function with the rewriter, the memref + // operands, and the loop induction variables. This function will return the + // value to store at the current index. + Value valueToStore = processIteration(rewriter, operands, loopIvs); + rewriter.create(loc, valueToStore, alloc, + llvm::makeArrayRef(loopIvs)); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); +} + +namespace { +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public ConversionPattern { + BinaryOpLowering(MLIRContext *ctx) + : ConversionPattern(BinaryOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the BinaryOp. This + // allows for using the nice named accessors that are generated by the + // ODS. + typename BinaryOp::OperandAdaptor binaryAdaptor(memRefOperands); + + // Generate loads for the element of 'lhs' and 'rhs' at the inner + // loop. 
+ auto loadedLhs = + rewriter.create(loc, binaryAdaptor.lhs(), loopIvs); + auto loadedRhs = + rewriter.create(loc, binaryAdaptor.rhs(), loopIvs); + + // Create the binary operation performed on the loaded values. + return rewriter.create(loc, loadedLhs, loadedRhs); + }); + return matchSuccess(); + } +}; +using AddOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Constant operations +//===----------------------------------------------------------------------===// + +struct ConstantOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ConstantOp op, + PatternRewriter &rewriter) const final { + DenseElementsAttr constantValue = op.value(); + Location loc = op.getLoc(); + + // When lowering the constant operation, we allocate and assign the constant + // values to a corresponding memref allocation. + auto tensorType = op.getType().cast(); + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // We will be generating constant indices up-to the largest dimension. + // Create these constants up-front to avoid large amounts of redundant + // operations. + auto valueShape = memRefType.getShape(); + SmallVector constantIndices; + for (auto i : llvm::seq( + 0, *std::max_element(valueShape.begin(), valueShape.end()))) + constantIndices.push_back(rewriter.create(loc, i)); + + // The constant operation represents a multi-dimensional constant, so we + // will need to generate a store for each of the elements. The following + // functor recursively walks the dimensions of the constant shape, + // generating a store when the recursion hits the base case. 
+ SmallVector indices; + auto valueIt = constantValue.getValues().begin(); + std::function storeElements = [&](uint64_t dimension) { + // The last dimension is the base case of the recursion, at this point + // we store the element at the given index. + if (dimension == valueShape.size()) { + rewriter.create( + loc, rewriter.create(loc, *valueIt++), alloc, + llvm::makeArrayRef(indices)); + return; + } + + // Otherwise, iterate over the current dimension and add the indices to + // the list. + for (uint64_t i = 0, e = valueShape[dimension]; i != e; ++i) { + indices.push_back(constantIndices[i]); + storeElements(dimension + 1); + indices.pop_back(); + } + }; + + // Start the element storing recursion from the first dimension. + storeElements(/*dimension=*/0); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnOpLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(toy::ReturnOp op, + PatternRewriter &rewriter) const final { + // During this lowering, we expect that all function calls have been + // inlined. + if (op.hasOperand()) + return matchFailure(); + + // We lower "toy.return" directly to "std.return". 
+ rewriter.replaceOpWithNewOp(op); + return matchSuccess(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: Transpose operations +//===----------------------------------------------------------------------===// + +struct TransposeOpLowering : public ConversionPattern { + TransposeOpLowering(MLIRContext *ctx) + : ConversionPattern(toy::TransposeOp::getOperationName(), 1, ctx) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops( + op, operands, rewriter, + [loc](PatternRewriter &rewriter, ArrayRef memRefOperands, + ArrayRef loopIvs) { + // Generate an adaptor for the remapped operands of the TransposeOp. + // This allows for using the nice named accessors that are generated + // by the ODS. + toy::TransposeOpOperandAdaptor transposeAdaptor(memRefOperands); + Value input = transposeAdaptor.input(); + + // Transpose the elements by generating a load from the reverse + // indices. + SmallVector reverseIvs(llvm::reverse(loopIvs)); + return rewriter.create(loc, input, reverseIvs); + }); + return matchSuccess(); + } +}; + +} // end anonymous namespace. + +//===----------------------------------------------------------------------===// +// ToyToAffineLoweringPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering to affine loops of the toy operations that are +/// computationally intensive (like matmul for example...) while keeping the +/// rest of the code in the Toy dialect. +namespace { +struct ToyToAffineLoweringPass : public FunctionPass { + void runOnFunction() final; +}; +} // end anonymous namespace. + +void ToyToAffineLoweringPass::runOnFunction() { + auto function = getFunction(); + + // We only lower the main function as we expect that all other functions have + // been inlined. 
+ if (function.getName() != "main") + return; + + // Verify that the given main has no inputs and results. + if (function.getNumArguments() || function.getType().getNumResults()) { + function.emitError("expected 'main' to have 0 inputs and 0 results"); + return signalPassFailure(); + } + + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine` and `Standard` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that don't want + // to lower, `toy.print`, as `legal`. + target.addIllegalDialect(); + target.addLegalOp(); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. + if (failed(applyPartialConversion(getFunction(), target, patterns))) + signalPassFailure(); +} + +/// Create a pass for lowering operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). 
+std::unique_ptr mlir::toy::createLowerToAffinePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f1a6ae8bbee6c850f6c1e26e6c595b34a19b5ab --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp @@ -0,0 +1,204 @@ +//====- LowerToLLVM.cpp - Lowering from Toy+Affine+Std to LLVM ------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops and standard operations. This lowering expects that all calls +// have been inlined, and all shapes have been resolved. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/Sequence.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToLLVM RewritePatterns +//===----------------------------------------------------------------------===// + +namespace { +/// Lowers `toy.print` to a loop nest calling `printf` on each of the individual +/// 
elements of the array. +class PrintOpLowering : public ConversionPattern { +public: + explicit PrintOpLowering(MLIRContext *context) + : ConversionPattern(toy::PrintOp::getOperationName(), 1, context) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto memRefType = (*op->operand_type_begin()).cast(); + auto memRefShape = memRefType.getShape(); + auto loc = op->getLoc(); + auto *llvmDialect = + op->getContext()->getRegisteredDialect(); + assert(llvmDialect && "expected llvm dialect to be registered"); + + ModuleOp parentModule = op->getParentOfType(); + + // Get a symbol reference to the printf function, inserting it if necessary. + auto printfRef = getOrInsertPrintf(rewriter, parentModule, llvmDialect); + Value formatSpecifierCst = getOrCreateGlobalString( + loc, rewriter, "frmt_spec", StringRef("%f \0", 4), parentModule, + llvmDialect); + Value newLineCst = getOrCreateGlobalString( + loc, rewriter, "nl", StringRef("\n\0", 2), parentModule, llvmDialect); + + // Create a loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (unsigned i = 0, e = memRefShape.size(); i != e; ++i) { + auto lowerBound = rewriter.create(loc, 0); + auto upperBound = rewriter.create(loc, memRefShape[i]); + auto step = rewriter.create(loc, 1); + auto loop = + rewriter.create(loc, lowerBound, upperBound, step); + loop.getBody()->clear(); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body. + rewriter.setInsertionPointToStart(loop.getBody()); + + // Insert a newline after each of the inner dimensions of the shape. + if (i != e - 1) + rewriter.create(loc, printfRef, rewriter.getIntegerType(32), + newLineCst); + rewriter.create(loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to printf for the current element of the loop. 
+ auto printOp = cast(op); + auto elementLoad = rewriter.create(loc, printOp.input(), loopIvs); + rewriter.create(loc, printfRef, rewriter.getIntegerType(32), + ArrayRef({formatSpecifierCst, elementLoad})); + + // Notify the rewriter that this operation has been removed. + rewriter.eraseOp(op); + return matchSuccess(); + } + +private: + /// Return a symbol reference to the printf function, inserting it into the + /// module if necessary. + static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter, + ModuleOp module, + LLVM::LLVMDialect *llvmDialect) { + auto *context = module.getContext(); + if (module.lookupSymbol("printf")) + return SymbolRefAttr::get("printf", context); + + // Create a function declaration for printf, the signature is: + // * `i32 (i8*, ...)` + auto llvmI32Ty = LLVM::LLVMType::getInt32Ty(llvmDialect); + auto llvmI8PtrTy = LLVM::LLVMType::getInt8PtrTy(llvmDialect); + auto llvmFnType = LLVM::LLVMType::getFunctionTy(llvmI32Ty, llvmI8PtrTy, + /*isVarArg=*/true); + + // Insert the printf function into the body of the parent module. + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + rewriter.create(module.getLoc(), "printf", llvmFnType); + return SymbolRefAttr::get("printf", context); + } + + /// Return a value representing an access into a global string with the given + /// name, creating the string if necessary. + static Value getOrCreateGlobalString(Location loc, OpBuilder &builder, + StringRef name, StringRef value, + ModuleOp module, + LLVM::LLVMDialect *llvmDialect) { + // Create the global at the entry of the module. 
+ LLVM::GlobalOp global; + if (!(global = module.lookupSymbol(name))) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto type = LLVM::LLVMType::getArrayTy( + LLVM::LLVMType::getInt8Ty(llvmDialect), value.size()); + global = builder.create(loc, type, /*isConstant=*/true, + LLVM::Linkage::Internal, name, + builder.getStringAttr(value)); + } + + // Get the pointer to the first character in the global string. + Value globalPtr = builder.create(loc, global); + Value cst0 = builder.create( + loc, LLVM::LLVMType::getInt64Ty(llvmDialect), + builder.getIntegerAttr(builder.getIndexType(), 0)); + return builder.create( + loc, LLVM::LLVMType::getInt8PtrTy(llvmDialect), globalPtr, + ArrayRef({cst0, cst0})); + } +}; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// ToyToLLVMLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToLLVMLoweringPass : public ModulePass { + void runOnModule() final; +}; +} // end anonymous namespace + +void ToyToLLVMLoweringPass::runOnModule() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. For this lowering, we are only targeting + // the LLVM dialect. + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalOp(); + + // During this lowering, we will also be lowering the MemRef types, that are + // currently being operated on, to a representation in LLVM. Do perform this + // conversion we use a TypeConverter as part of the lowering. This converter + // details how one type maps to another. This is necessary now that we will be + // doing more complicated lowerings, involving loop region arguments. + LLVMTypeConverter typeConverter(&getContext()); + + // Now that the conversion target has been defined, we need to provide the + // patterns used for lowering. 
At this point of the compilation process, we + // have a combination of `toy`, `affine`, and `std` operations. Luckily, there + // are already exists a set of patterns to transform `affine` and `std` + // dialects. These patterns lowering in multiple stages, relying on transitive + // lowerings. Transitive lowering, or A->B->C lowering, is when multiple + // patterns must be applied to fully transform an illegal operation into a + // set of legal ones. + OwningRewritePatternList patterns; + populateAffineToStdConversionPatterns(patterns, &getContext()); + populateLoopToStdConversionPatterns(patterns, &getContext()); + populateStdToLLVMConversionPatterns(typeConverter, patterns); + + // The only remaining operation to lower from the `toy` dialect, is the + // PrintOp. + patterns.insert(&getContext()); + + // We want to completely lower to LLVM, so we use a `FullConversion`. This + // ensures that only legal operations will remain after the conversion. + auto module = getModule(); + if (failed(applyFullConversion(module, target, patterns, &typeConverter))) + signalPassFailure(); +} + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. +std::unique_ptr mlir::toy::createLowerToLLVMPass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d543f69bdc6a0ab2aca2364ee2f91c8cbe2140e --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp @@ -0,0 +1,674 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/StandardTypes.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::makeArrayRef; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. +/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. 
+ theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (auto &record : moduleAST) { + if (FunctionAST *funcAST = llvm::dyn_cast(record.get())) { + auto func = mlirGen(*funcAST); + if (!func) + return nullptr; + + theModule.push_back(func); + functionMap.insert({func.getName(), func}); + } else if (StructAST *str = llvm::dyn_cast(record.get())) { + if (failed(mlirGen(*str))) + return nullptr; + } else { + llvm_unreachable("unknown record type"); + } + } + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable> + symbolTable; + using SymbolTableScopeT = + llvm::ScopedHashTableScope>; + + /// A mapping for the functions that have been code generated to MLIR. + llvm::StringMap functionMap; + + /// A mapping for named struct types to the underlying MLIR type and the + /// original AST node. + llvm::StringMap> structMap; + + /// Helper conversion for a Toy AST location to an MLIR location. 
+ mlir::Location loc(Location loc) { + return builder.getFileLineColLoc(builder.getIdentifier(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + mlir::LogicalResult declare(VarDeclExprAST &var, mlir::Value value) { + if (symbolTable.count(var.getName())) + return mlir::failure(); + symbolTable.insert(var.getName(), {value, &var}); + return mlir::success(); + } + + /// Create an MLIR type for the given struct. + mlir::LogicalResult mlirGen(StructAST &str) { + if (structMap.count(str.getName())) + return emitError(loc(str.loc())) << "error: struct type with name `" + << str.getName() << "' already exists"; + + auto variables = str.getVariables(); + std::vector elementTypes; + elementTypes.reserve(variables.size()); + for (auto &variable : variables) { + if (variable->getInitVal()) + return emitError(loc(variable->loc())) + << "error: variables within a struct definition must not have " + "initializers"; + if (!variable->getType().shape.empty()) + return emitError(loc(variable->loc())) + << "error: variables within a struct definition must not have " + "initializers"; + + mlir::Type type = getType(variable->getType(), variable->loc()); + if (!type) + return mlir::failure(); + elementTypes.push_back(type); + } + + structMap.try_emplace(str.getName(), StructType::get(elementTypes), &str); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. 
+ llvm::SmallVector argTypes; + argTypes.reserve(proto.getArgs().size()); + for (auto &arg : proto.getArgs()) { + mlir::Type type = getType(arg->getType(), arg->loc()); + if (!type) + return nullptr; + argTypes.push_back(type); + } + auto func_type = builder.getFunctionType(argTypes, llvm::None); + return mlir::FuncOp::create(location, proto.getName(), func_type); + } + + /// Emit a new function and add it to the MLIR module. + mlir::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + SymbolTableScopeT var_scope(symbolTable); + + // Create an MLIR function for the given prototype. + mlir::FuncOp function(mlirGen(*funcAST.getProto())); + if (!function) + return nullptr; + + // Let's start the body of the function now! + // In MLIR the entry block of the function is special: it must have the same + // argument list as the function itself. + auto &entryBlock = *function.addEntryBlock(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. + for (const auto &name_value : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(*std::get<0>(name_value), std::get<1>(name_value)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. 
+ // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + builder.create(loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType(function.getType().getInputs(), + *returnOp.operand_type_begin())); + } + + return function; + } + + /// Return the struct type that is the result of the given expression, or null + /// if it cannot be inferred. + StructAST *getStructFor(ExprAST *expr) { + llvm::StringRef structName; + if (auto *decl = llvm::dyn_cast(expr)) { + auto varIt = symbolTable.lookup(decl->getName()); + if (!varIt.first) + return nullptr; + structName = varIt.second->getType().name; + } else if (auto *access = llvm::dyn_cast(expr)) { + if (access->getOp() != '.') + return nullptr; + // The name being accessed should be in the RHS. + auto *name = llvm::dyn_cast(access->getRHS()); + if (!name) + return nullptr; + StructAST *parentStruct = getStructFor(access->getLHS()); + if (!parentStruct) + return nullptr; + + // Get the element within the struct corresponding to the name. + VarDeclExprAST *decl = nullptr; + for (auto &var : parentStruct->getVariables()) { + if (var->getName() == name->getName()) { + decl = var.get(); + break; + } + } + if (!decl) + return nullptr; + structName = decl->getType().name; + } + if (structName.empty()) + return nullptr; + + // If the struct name was valid, check for an entry in the struct map. + auto structIt = structMap.find(structName); + if (structIt == structMap.end()) + return nullptr; + return structIt->second.second; + } + + /// Return the numeric member index of the given struct access expression. + llvm::Optional getMemberIndex(BinaryExprAST &accessOp) { + assert(accessOp.getOp() == '.' 
&& "expected access operation"); + + // Lookup the struct node for the LHS. + StructAST *structAST = getStructFor(accessOp.getLHS()); + if (!structAST) + return llvm::None; + + // Get the name from the RHS. + VariableExprAST *name = llvm::dyn_cast(accessOp.getRHS()); + if (!name) + return llvm::None; + + auto structVars = structAST->getVariables(); + auto it = llvm::find_if(structVars, [&](auto &var) { + return var->getName() == name->getName(); + }); + if (it == structVars.end()) + return llvm::None; + return it - structVars.begin(); + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + auto location = loc(binop.loc()); + + // If this is an access operation, handle it immediately. + if (binop.getOp() == '.') { + llvm::Optional accessIndex = getMemberIndex(binop); + if (!accessIndex) { + emitError(location, "invalid access into struct expression"); + return nullptr; + } + return builder.create(location, lhs, *accessIndex); + } + + // Otherwise, this is a normal binary op. + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. 
+ switch (binop.getOp()) { + case '+': + return builder.create(location, lhs, rhs); + case '*': + return builder.create(location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName()).first) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + mlir::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().hasValue()) { + if (!(expr = mlirGen(*ret.getExpr().getValue()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + builder.create(location, expr ? makeArrayRef(expr) + : ArrayRef()); + return mlir::success(); + } + + /// Emit a constant for a literal/constant array. It will be emitted as a + /// flattened array of data in an Attribute attached to a `toy.constant` + /// operation. See documentation on [Attributes](LangRef.md#attributes) for + /// more details. Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. 
+ /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf64> + /// + mlir::DenseElementsAttr getConstantAttr(LiteralExprAST &lit) { + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(std::accumulate(lit.getDims().begin(), lit.getDims().end(), 1, + std::multiplies())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + return mlir::DenseElementsAttr::get(dataType, llvm::makeArrayRef(data)); + } + mlir::DenseElementsAttr getConstantAttr(NumberExprAST &lit) { + // The type of this attribute is tensor of 64-bit floating-point with no + // shape. + mlir::Type elementType = builder.getF64Type(); + auto dataType = mlir::RankedTensorType::get({}, elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + return mlir::DenseElementsAttr::get(dataType, + llvm::makeArrayRef(lit.getValue())); + } + /// Emit a constant for a struct literal. It will be emitted as an array of + /// other literals in an Attribute attached to a `toy.struct_constant` + /// operation. This function returns the generated constant, along with the + /// corresponding struct type. 
+ std::pair + getConstantAttr(StructLiteralExprAST &lit) { + std::vector attrElements; + std::vector typeElements; + + for (auto &var : lit.getValues()) { + if (auto *number = llvm::dyn_cast(var.get())) { + attrElements.push_back(getConstantAttr(*number)); + typeElements.push_back(getType(llvm::None)); + } else if (auto *lit = llvm::dyn_cast(var.get())) { + attrElements.push_back(getConstantAttr(*lit)); + typeElements.push_back(getType(llvm::None)); + } else { + auto *structLit = llvm::cast(var.get()); + auto attrTypePair = getConstantAttr(*structLit); + attrElements.push_back(attrTypePair.first); + typeElements.push_back(attrTypePair.second); + } + } + mlir::ArrayAttr dataAttr = builder.getArrayAttr(attrElements); + mlir::Type dataType = StructType::get(typeElements); + return std::make_pair(dataAttr, dataType); + } + + /// Emit an array literal. + mlir::Value mlirGen(LiteralExprAST &lit) { + mlir::Type type = getType(lit.getDims()); + mlir::DenseElementsAttr dataAttribute = getConstantAttr(lit); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return builder.create(loc(lit.loc()), type, dataAttribute); + } + + /// Emit a struct literal. It will be emitted as an array of + /// other literals in an Attribute attached to a `toy.struct_constant` + /// operation. + mlir::Value mlirGen(StructLiteralExprAST &lit) { + mlir::ArrayAttr dataAttr; + mlir::Type dataType; + std::tie(dataAttr, dataType) = getConstantAttr(lit); + + // Build the MLIR op `toy.struct_constant`. This invokes the + // `StructConstantOp::build` method. + return builder.create(loc(lit.loc()), dataType, dataAttr); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as doubles. 
+ /// Attributes are the way MLIR attaches constant to operations. + void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builting calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return builder.create(location, operands[0]); + } + + // Otherwise this is a call to a user-defined function. Calls to ser-defined + // functions are mapped to a custom call that takes the callee name as an + // attribute. + auto calledFuncIt = functionMap.find(callee); + if (calledFuncIt == functionMap.end()) { + emitError(location) << "no defined function found for '" << callee << "'"; + return nullptr; + } + mlir::FuncOp calledFunc = calledFuncIt->second; + return builder.create( + location, calledFunc.getType().getResult(0), + builder.getSymbolRefAttr(callee), operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). 
+ mlir::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + builder.create(loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return builder.create(loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. + mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_StructLiteral: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // Handle the case where we are initializing a struct value. + VarType varType = vardecl.getType(); + if (!varType.name.empty()) { + // Check that the initializer type is the same as the variable + // declaration. 
+ mlir::Type type = getType(varType, vardecl.loc()); + if (!type) + return nullptr; + if (type != value->getType()) { + emitError(loc(vardecl.loc())) + << "struct type of initializer is different than the variable " + "declaration. Got " + << value->getType() << ", but expected " << type; + return nullptr; + } + + // Otherwise, we have the initializer value, but in case the variable was + // declared with specific shape, we emit a "reshape" operation. It will + // get optimized out later as needed. + } else if (!varType.shape.empty()) { + value = builder.create(loc(vardecl.loc()), + getType(varType.shape), value); + } + + // Register the value in the symbol table. + if (failed(declare(vardecl, value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + mlir::LogicalResult mlirGen(ExprASTList &blockAST) { + SymbolTableScopeT var_scope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. + if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF64Type()); + + // Otherwise, we use the given shape. 
+ return mlir::RankedTensorType::get(shape, builder.getF64Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above for non-struct types). + mlir::Type getType(const VarType &type, const Location &location) { + if (!type.name.empty()) { + auto it = structMap.find(type.name); + if (it == structMap.end()) { + emitError(loc(location)) + << "error: unknown struct type '" << type.name << "'"; + return nullptr; + } + return it->second.first; + } + + return getType(type.shape); + } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. +mlir::OwningModuleRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..517a1f075306485003e099ed805a23f77cb49147 --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp @@ -0,0 +1,104 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Pass/Pass.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a FunctionPass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. +/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +class ShapeInferencePass : public mlir::FunctionPass { +public: + void runOnFunction() override { + auto f = getFunction(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. + llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). 
+ auto nextop = llvm::find_if(opWorklist, returnsDynamicShape); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LLVM_DEBUG(llvm::dbgs() << "Inferring shape for: " << *op << "\n"); + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. + if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !resultType.isa(); + }); + } +}; +} // end anonymous namespace + +/// Create a Shape Inference pass. +std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c688a53d86f7db586916127e922b345379d836fe --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp @@ -0,0 +1,92 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "toy/Dialect.h" +#include +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // end anonymous namespace + +/// Fold simple cast operations that return the same type as the input. +OpFoldResult CastOp::fold(ArrayRef operands) { + return mlir::impl::foldCastOp(*this); +} + +/// Fold constants. +OpFoldResult ConstantOp::fold(ArrayRef operands) { return value(); } + +/// Fold struct constants. +OpFoldResult StructConstantOp::fold(ArrayRef operands) { + return value(); +} + +/// Fold simple struct access operations that access into a constant. +OpFoldResult StructAccessOp::fold(ArrayRef operands) { + auto structAttr = operands.front().dyn_cast_or_null(); + if (!structAttr) + return nullptr; + + size_t elementIndex = index().getZExtValue(); + return structAttr.getValue()[elementIndex]; +} + +/// This is an example of a c++ rewrite pattern for the TransposeOp. It +/// optimizes the following scenario: transpose(transpose(x)) -> transpose(x) +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + mlir::PatternMatchResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. 
+ mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = + llvm::dyn_cast_or_null(transposeInput->getDefiningOp()); + + // If the input is defined by another Transpose, bingo! + if (!transposeInputOp) + return matchFailure(); + + // Use the rewriter to perform the replacement. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}, {transposeInputOp}); + return matchSuccess(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. +void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} diff --git a/mlir/examples/toy/Ch7/mlir/ToyCombine.td b/mlir/examples/toy/Ch7/mlir/ToyCombine.td new file mode 100644 index 0000000000000000000000000000000000000000..e6e33e84d7e8f3e13aea9840f3690029de025d94 --- /dev/null +++ b/mlir/examples/toy/Ch7/mlir/ToyCombine.td @@ -0,0 +1,62 @@ +//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines language-specific pattern match optimizations for Toy using +// Declarative Rewrite Rules (DRR) specified using TableGen records. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TOY_COMBINE +#define TOY_COMBINE + +include "toy/Ops.td" + +/// Note: The DRR definition used for defining patterns is shown below: +/// +/// class Pattern< +/// dag sourcePattern, list resultPatterns, +/// list additionalConstraints = [], +/// dag benefitsAdded = (addBenefit 0) +/// >; + +//===----------------------------------------------------------------------===// +// Basic Pattern-Match and Rewrite +//===----------------------------------------------------------------------===// + +// Reshape(Reshape(x)) = Reshape(x) +def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)), + (ReshapeOp $arg)>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite using Native Code Call +//===----------------------------------------------------------------------===// + +// Native Code Calls may be used for more complex transformations using inline +// C++ and C++ helper functions. + +// Reshape(Constant(x)) = x' +def ReshapeConstant : + NativeCodeCall<"$0.reshape(($1->getType()).cast())">; +def FoldConstantReshapeOptPattern : Pat< + (ReshapeOp:$res (ConstantOp $arg)), + (ConstantOp (ReshapeConstant $arg, $res))>; + +//===----------------------------------------------------------------------===// +// Pattern-Match and Rewrite with Constraints +//===----------------------------------------------------------------------===// + +// DRR allows for constraint checking when the transformation is conditional +// on operand properties. 
+ +// Reshape(x) = x, where input and output shapes are identical +def TypesAreIdentical : ConstraintgetType() == $1->getType()">>; +def RedundantReshapeOptPattern : Pat< + (ReshapeOp:$res $arg), (replaceWithValue $arg), + [(TypesAreIdentical $res, $arg)]>; + +#endif // TOY_COMBINE diff --git a/mlir/examples/toy/Ch7/parser/AST.cpp b/mlir/examples/toy/Ch7/parser/AST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..669bc9dbec21514b8bf3a7971028600b33e8d41a --- /dev/null +++ b/mlir/examples/toy/Ch7/parser/AST.cpp @@ -0,0 +1,271 @@ +//===- AST.cpp - Helper for printing out the Toy AST ----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST dump for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "mlir/ADT/TypeSwitch.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. 
+class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(StructLiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + void dump(StructAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + mlir::TypeSwitch(expr) + .Case([&](auto *node) { this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. 
+void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + if (auto *initVal = varDecl->getInitVal()) + dump(initVal); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + mlir::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + mlir::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. +void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a struct literal. 
+void ASTDumper::dump(StructLiteralExprAST *node) { + INDENT(); + llvm::errs() << "Struct Literal: "; + for (auto &value : node->getValues()) + dump(value.get()); + indent(); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. +void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().hasValue()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + if (!type.name.empty()) + llvm::errs() << type.name; + else + mlir::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. 
+void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "'\n"; + indent(); + llvm::errs() << "Params: ["; + mlir::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a struct. +void ASTDumper::dump(StructAST *node) { + INDENT(); + llvm::errs() << "Struct: " << node->getName() << " " << loc(node) << "\n"; + + { + INDENT(); + llvm::errs() << "Variables: [\n"; + for (auto &variable : node->getVariables()) + dump(variable.get()); + indent(); + llvm::errs() << "]\n"; + } +} + +/// Print a module, actually loop over the functions and print them in sequence. +void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &record : *node) { + if (FunctionAST *function = llvm::dyn_cast(record.get())) + dump(function); + else if (StructAST *str = llvm::dyn_cast(record.get())) + dump(str); + else + llvm::errs() << "getKind() << ">\n"; + } +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c6afab594e1fc74a3c75df91a32c8ba0e45d4543 --- /dev/null +++ b/mlir/examples/toy/Ch7/toyc.cpp @@ -0,0 +1,275 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Analysis/Verifier.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/Parser.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the input file as an MLIR file"))); + +namespace { +enum Action { + None, + DumpAST, + DumpMLIR, + DumpMLIRAffine, + DumpMLIRLLVM, + DumpLLVMIR, + RunJIT +}; +} +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump")), + cl::values(clEnumValN(DumpMLIRAffine, "mlir-affine", + "output the MLIR dump after affine lowering")), + 
cl::values(clEnumValN(DumpMLIRLLVM, "mlir-llvm", + "output the MLIR dump after llvm lowering")), + cl::values(clEnumValN(DumpLLVMIR, "llvm", "output the LLVM IR dump")), + cl::values( + clEnumValN(RunJIT, "jit", + "JIT the code and run it by invoking the main function"))); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. +std::unique_ptr parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), filename); + Parser parser(lexer); + return parser.parseModule(); +} + +int loadMLIR(mlir::MLIRContext &context, mlir::OwningModuleRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).endswith(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code EC = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << EC.message() << "\n"; + return -1; + } + + // Parse the input mlir. 
+ llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +int loadAndProcessMLIR(mlir::MLIRContext &context, + mlir::OwningModuleRef &module) { + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(&context); + // Apply any generic pass manager command line options and run the pipeline. + applyPassManagerCLOptions(pm); + + // Check to see what granularity of MLIR we are compiling to. + bool isLoweringToAffine = emitAction >= Action::DumpMLIRAffine; + bool isLoweringToLLVM = emitAction >= Action::DumpMLIRLLVM; + + if (enableOpt || isLoweringToAffine) { + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::toy::createDeadFunctionEliminationPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + } + + if (isLoweringToAffine) { + // Partially lower the toy dialect with a few cleanups afterwards. + pm.addPass(mlir::toy::createLowerToAffinePass()); + + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Add optimizations if enabled. + if (enableOpt) { + optPM.addPass(mlir::createLoopFusionPass()); + optPM.addPass(mlir::createMemRefDataFlowOptPass()); + } + } + + if (isLoweringToLLVM) { + // Finish lowering the toy IR to the LLVM dialect. 
+ pm.addPass(mlir::toy::createLowerToLLVMPass()); + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +int dumpLLVMIR(mlir::ModuleOp module) { + auto llvmModule = mlir::translateModuleToLLVMIR(module); + if (!llvmModule) { + llvm::errs() << "Failed to emit LLVM IR\n"; + return -1; + } + + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + mlir::ExecutionEngine::setupTargetTriple(llvmModule.get()); + + /// Optionally run an optimization pipeline over the llvm module. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + if (auto err = optPipeline(llvmModule.get())) { + llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; + return -1; + } + llvm::errs() << *llvmModule << "\n"; + return 0; +} + +int runJit(mlir::ModuleOp module) { + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // An optimization pipeline to use within the execution engine. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + + // Create an MLIR execution engine. The execution engine eagerly JIT-compiles + // the module. + auto maybeEngine = mlir::ExecutionEngine::create(module, optPipeline); + assert(maybeEngine && "failed to construct an execution engine"); + auto &engine = maybeEngine.get(); + + // Invoke the JIT-compiled function. 
+ auto invocationResult = engine->invoke("main"); + if (invocationResult) { + llvm::errs() << "JIT invocation failed\n"; + return -1; + } + + return 0; +} + +int main(int argc, char **argv) { + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + if (emitAction == Action::DumpAST) + return dumpAST(); + + // If we aren't dumping the AST, then we are compiling with/to MLIR. + + // Register our Dialect with MLIR. + mlir::registerDialect(); + + mlir::MLIRContext context; + mlir::OwningModuleRef module; + if (int error = loadAndProcessMLIR(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. + bool isOutputingMLIR = emitAction <= Action::DumpMLIRLLVM; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + // Check to see if we are compiling to LLVM IR. + if (emitAction == Action::DumpLLVMIR) + return dumpLLVMIR(*module); + + // Otherwise, we must be running the jit. + if (emitAction == Action::RunJIT) + return runJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; +} diff --git a/mlir/examples/toy/README.md b/mlir/examples/toy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..53912c83abfb228f97a2ad4bac0a93bcd23176a3 --- /dev/null +++ b/mlir/examples/toy/README.md @@ -0,0 +1,7 @@ +# Toy Tutorial + +This contains sample code to support the tutorial on using MLIR for +building a compiler for a simple Toy language. + +See [g3doc/Tutorials/Toy](../../g3doc/Tutorials/Toy) at the root of +the repository for more informations. 
diff --git a/mlir/include/mlir-c/Core.h b/mlir/include/mlir-c/Core.h new file mode 100644 index 0000000000000000000000000000000000000000..5e3e2087f8bf40f749470f9efb46d0e2a6ddd209 --- /dev/null +++ b/mlir/include/mlir-c/Core.h @@ -0,0 +1,109 @@ +/*===-- mlir-c/Core.h - Core Library C Interface ------------------*- C -*-===*\ +|* *| +|* Part of the MLIR Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to MLIR. *| +|* *| +\*===----------------------------------------------------------------------===*/ +#ifndef MLIR_C_CORE_H +#define MLIR_C_CORE_H + +#ifdef __cplusplus +#include +extern "C" { +#else +#include +#endif + +/// Opaque MLIR types. +/// Opaque C type for mlir::MLIRContext*. +typedef void *mlir_context_t; +/// Opaque C type for mlir::Type. +typedef const void *mlir_type_t; +/// Opaque C type for mlir::FuncOp. +typedef void *mlir_func_t; +/// Opaque C type for mlir::Attribute. +typedef const void *mlir_attr_t; + +/// Simple C lists for non-owning mlir Opaque C types. +/// Recommended usage is construction from the `data()` and `size()` of a scoped +/// owning SmallVectorImpl<...> and passing to one of the C functions declared +/// later in this file. +/// Once the function returns and the proper EDSC has been constructed, +/// resources are freed by exiting the scope. +typedef struct { + int64_t *values; + uint64_t n; +} int64_list_t; + +typedef struct { + mlir_type_t *types; + uint64_t n; +} mlir_type_list_t; + +typedef struct { + const char *name; + mlir_attr_t value; +} mlir_named_attr_t; + +typedef struct { + mlir_named_attr_t *list; + uint64_t n; +} mlir_named_attr_list_t; + +/// Minimal C API for exposing EDSCs to Swift, Python and other languages. 
+ +/// Returns an `mlir::MemRefType` of the element type `elemType` and shape +/// `sizes`. +mlir_type_t makeMemRefType(mlir_context_t context, mlir_type_t elemType, + int64_list_t sizes); + +/// Returns an `mlir::FunctionType` of the element type `elemType` and shape +/// `sizes`. +mlir_type_t makeFunctionType(mlir_context_t context, mlir_type_list_t inputs, + mlir_type_list_t outputs); + +/// Returns an `mlir::IndexType`. +mlir_type_t makeIndexType(mlir_context_t context); + +/// Returns an `mlir::IntegerAttr` of the specified type that contains the given +/// value. +mlir_attr_t makeIntegerAttr(mlir_type_t type, int64_t value); + +/// Returns an `mlir::BoolAttr` with the given value. +mlir_attr_t makeBoolAttr(mlir_context_t context, bool value); + +/// Returns an `mlir::FloatAttr` with the given value. +mlir_attr_t makeFloatAttr(mlir_context_t context, float value); + +/// Returns an `mlir::StringAttr` with the given value. +mlir_attr_t makeStringAttr(mlir_context_t context, const char *value); + +/// Parses an MLIR type from the string `type` in the given context. Returns a +/// NULL type on error. If non-NULL, `charsRead` will contain the number of +/// characters that were processed by the parser. +mlir_type_t mlirParseType(const char *type, mlir_context_t context, + uint64_t *charsRead); + +/// Returns the arity of `function`. +unsigned getFunctionArity(mlir_func_t function); + +/// Returns the rank of the `function` argument at position `pos`. +/// If the argument is of MemRefType, this returns the rank of the MemRef. +/// Otherwise returns `0`. +/// TODO(ntv): support more than MemRefType and scalar Type. +unsigned getRankOfFunctionArgument(mlir_func_t function, unsigned pos); + +/// Returns an opaque mlir::Type of the `function` argument at position `pos`. 
+mlir_type_t getTypeOfFunctionArgument(mlir_func_t function, unsigned pos); + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // MLIR_C_CORE_H diff --git a/mlir/include/mlir/ADT/TypeSwitch.h b/mlir/include/mlir/ADT/TypeSwitch.h new file mode 100644 index 0000000000000000000000000000000000000000..2dbc611f557e096157c847e8e37910ed9d2b9638 --- /dev/null +++ b/mlir/include/mlir/ADT/TypeSwitch.h @@ -0,0 +1,176 @@ +//===- TypeSwitch.h - Switch functionality for RTTI casting -*- C++ -*-----===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the TypeSwitch template, which mimics a switch() +// statement whose cases are type names. +// +//===-----------------------------------------------------------------------===/ + +#ifndef MLIR_SUPPORT_TYPESWITCH_H +#define MLIR_SUPPORT_TYPESWITCH_H + +#include "mlir/Support/LLVM.h" +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/Optional.h" + +namespace mlir { +namespace detail { + +template class TypeSwitchBase { +public: + TypeSwitchBase(const T &value) : value(value) {} + TypeSwitchBase(TypeSwitchBase &&other) : value(other.value) {} + ~TypeSwitchBase() = default; + + /// TypeSwitchBase is not copyable. + TypeSwitchBase(const TypeSwitchBase &) = delete; + void operator=(const TypeSwitchBase &) = delete; + void operator=(TypeSwitchBase &&other) = delete; + + /// Invoke a case on the derived class with multiple case types. + template + DerivedT &Case(CallableT &&caseFn) { + DerivedT &derived = static_cast(*this); + return derived.template Case(caseFn) + .template Case(caseFn); + } + + /// Invoke a case on the derived class, inferring the type of the Case from + /// the first input of the given callable. 
+ /// Note: This inference rules for this overload are very simple: strip + /// pointers and references. + template DerivedT &Case(CallableT &&caseFn) { + using Traits = FunctionTraits>; + using CaseT = std::remove_cv_t>>>; + + DerivedT &derived = static_cast(*this); + return derived.template Case(std::forward(caseFn)); + } + +protected: + /// Trait to check whether `ValueT` provides a 'dyn_cast' method with type + /// `CastT`. + template + using has_dyn_cast_t = + decltype(std::declval().template dyn_cast()); + + /// Attempt to dyn_cast the given `value` to `CastT`. This overload is + /// selected if `value` already has a suitable dyn_cast method. + template + static auto castValue( + ValueT value, + typename std::enable_if_t< + is_detected::value> * = nullptr) { + return value.template dyn_cast(); + } + + /// Attempt to dyn_cast the given `value` to `CastT`. This overload is + /// selected if llvm::dyn_cast should be used. + template + static auto castValue( + ValueT value, + typename std::enable_if_t< + !is_detected::value> * = nullptr) { + return dyn_cast(value); + } + + /// The root value we are switching on. + const T value; +}; +} // end namespace detail + +/// This class implements a switch-like dispatch statement for a value of 'T' +/// using dyn_cast functionality. Each `Case` takes a callable to be invoked +/// if the root value isa, the callable is invoked with the result of +/// dyn_cast() as a parameter. +/// +/// Example: +/// Operation *op = ...; +/// LogicalResult result = TypeSwitch(op) +/// .Case([](ConstantOp op) { ... }) +/// .Default([](Operation *op) { ... }); +/// +template +class TypeSwitch : public detail::TypeSwitchBase, T> { +public: + using BaseT = detail::TypeSwitchBase, T>; + using BaseT::BaseT; + using BaseT::Case; + TypeSwitch(TypeSwitch &&other) = default; + + /// Add a case on the given type. + template + TypeSwitch &Case(CallableT &&caseFn) { + if (result) + return *this; + + // Check to see if CaseT applies to 'value'. 
+ if (auto caseValue = BaseT::template castValue(this->value)) + result = caseFn(caseValue); + return *this; + } + + /// As a default, invoke the given callable within the root value. + template + LLVM_NODISCARD ResultT Default(CallableT &&defaultFn) { + if (result) + return std::move(*result); + return defaultFn(this->value); + } + + LLVM_NODISCARD + operator ResultT() { + assert(result && "Fell off the end of a type-switch"); + return std::move(*result); + } + +private: + /// The pointer to the result of this switch statement, once known, + /// null before that. + Optional result; +}; + +/// Specialization of TypeSwitch for void returning callables. +template +class TypeSwitch + : public detail::TypeSwitchBase, T> { +public: + using BaseT = detail::TypeSwitchBase, T>; + using BaseT::BaseT; + using BaseT::Case; + TypeSwitch(TypeSwitch &&other) = default; + + /// Add a case on the given type. + template + TypeSwitch &Case(CallableT &&caseFn) { + if (foundMatch) + return *this; + + // Check to see if any of the types apply to 'value'. + if (auto caseValue = BaseT::template castValue(this->value)) { + caseFn(caseValue); + foundMatch = true; + } + return *this; + } + + /// As a default, invoke the given callable within the root value. + template void Default(CallableT &&defaultFn) { + if (!foundMatch) + defaultFn(this->value); + } + +private: + /// A flag detailing if we have already found a match. + bool foundMatch = false; +}; +} // end namespace mlir + +#endif // MLIR_SUPPORT_TYPESWITCH_H diff --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h new file mode 100644 index 0000000000000000000000000000000000000000..d0bcb932c04c78215893a301f8eb5a9e1d5da161 --- /dev/null +++ b/mlir/include/mlir/Analysis/AffineAnalysis.h @@ -0,0 +1,131 @@ +//===- AffineAnalysis.h - analyses for affine structures --------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for methods that perform analysis +// involving affine structures (AffineExprStorage, AffineMap, IntegerSet, etc.) +// and other IR structures that in turn use these. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H +#define MLIR_ANALYSIS_AFFINE_ANALYSIS_H + +#include "mlir/IR/Value.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" + +namespace mlir { + +class AffineApplyOp; +class AffineForOp; +class AffineValueMap; +class FlatAffineConstraints; +class Operation; + +/// Returns in `affineApplyOps`, the sequence of those AffineApplyOp +/// Operations that are reachable via a search starting from `operands` and +/// ending at those operands that are not the result of an AffineApplyOp. +void getReachableAffineApplyOps(ArrayRef operands, + SmallVectorImpl &affineApplyOps); + +/// Builds a system of constraints with dimensional identifiers corresponding to +/// the loop IVs of the forOps appearing in that order. Bounds of the loop are +/// used to add appropriate inequalities. Any symbols founds in the bound +/// operands are added as symbols in the system. Returns failure for the yet +/// unimplemented cases. +// TODO(bondhugula): handle non-unit strides. +LogicalResult getIndexSet(MutableArrayRef forOps, + FlatAffineConstraints *domain); + +/// Encapsulates a memref load or store access information. +struct MemRefAccess { + Value memref; + Operation *opInst; + SmallVector indices; + + /// Constructs a MemRefAccess from a load or store operation. + // TODO(b/119949820): add accessors to standard op's load, store, DMA op's to + // return MemRefAccess, i.e., loadOp->getAccess(), dmaOp->getRead/WriteAccess. 
+ explicit MemRefAccess(Operation *opInst); + + // Returns the rank of the memref associated with this access. + unsigned getRank() const; + // Returns true if this access is of a store op. + bool isStore() const; + + /// Populates 'accessMap' with composition of AffineApplyOps reachable from + /// 'indices'. + void getAccessMap(AffineValueMap *accessMap) const; + + /// Equal if both affine accesses can be proved to be equivalent at compile + /// time (considering the memrefs, their respective affine access maps and + /// operands). The equality of access functions + operands is checked by + /// subtracting fully composed value maps, and then simplifying the difference + /// using the expression flattener. + /// TODO: this does not account for aliasing of memrefs. + bool operator==(const MemRefAccess &rhs) const; + bool operator!=(const MemRefAccess &rhs) const { return !(*this == rhs); } +}; + +// DependenceComponent contains state about the direction of a dependence as an +// interval [lb, ub] for an AffineForOp. +// Distance vectors components are represented by the interval [lb, ub] with +// lb == ub. +// Direction vectors components are represented by the interval [lb, ub] with +// lb < ub. Note that ub/lb == None means unbounded. +struct DependenceComponent { + // The AffineForOp Operation associated with this dependence component. + Operation *op; + // The lower bound of the dependence distance. + Optional lb; + // The upper bound of the dependence distance (inclusive). + Optional ub; + DependenceComponent() : lb(llvm::None), ub(llvm::None) {} +}; + +/// Checks whether two accesses to the same memref access the same element. +/// Each access is specified using the MemRefAccess structure, which contains +/// the operation, indices and memref associated with the access. Returns +/// 'NoDependence' if it can be determined conclusively that the accesses do not +/// access the same memref element. 
If 'allowRAR' is true, will consider +/// read-after-read dependences (typically used by applications trying to +/// optimize input reuse). +// TODO(andydavis) Wrap 'dependenceConstraints' and 'dependenceComponents' into +// a single struct. +// TODO(andydavis) Make 'dependenceConstraints' optional arg. +struct DependenceResult { + enum ResultEnum { + HasDependence, // A dependence exists between 'srcAccess' and 'dstAccess'. + NoDependence, // No dependence exists between 'srcAccess' and 'dstAccess'. + Failure, // Dependence check failed due to unsupported cases. + } value; + DependenceResult(ResultEnum v) : value(v) {} +}; + +DependenceResult checkMemrefAccessDependence( + const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, + unsigned loopDepth, FlatAffineConstraints *dependenceConstraints, + SmallVector *dependenceComponents, + bool allowRAR = false); + +/// Utility function that returns true if the provided DependenceResult +/// corresponds to a dependence result. +inline bool hasDependence(DependenceResult result) { + return result.value == DependenceResult::HasDependence; +} + +/// Returns in 'depCompsVec', dependence components for dependences between all +/// load and store ops in loop nest rooted at 'forOp', at loop depths in range +/// [1, maxLoopDepth]. +void getDependenceComponents( + AffineForOp forOp, unsigned maxLoopDepth, + std::vector> *depCompsVec); + +} // end namespace mlir + +#endif // MLIR_ANALYSIS_AFFINE_ANALYSIS_H diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h new file mode 100644 index 0000000000000000000000000000000000000000..47e0ddab5479761a0eab5624e876ef3fd293db8a --- /dev/null +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -0,0 +1,815 @@ +//===- AffineStructures.h - MLIR Affine Structures Class --------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Structures for affine/polyhedral analysis of ML functions. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_AFFINE_STRUCTURES_H +#define MLIR_ANALYSIS_AFFINE_STRUCTURES_H + +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Support/LogicalResult.h" + +namespace mlir { + +class AffineApplyOp; +class AffineBound; +class AffineCondition; +class AffineMap; +class AffineForOp; +class IntegerSet; +class MLIRContext; +class Value; +class HyperRectangularSet; +class MemRefType; + +/// A mutable affine map. Its affine expressions are however unique. +struct MutableAffineMap { +public: + MutableAffineMap() {} + MutableAffineMap(AffineMap map); + + ArrayRef getResults() const { return results; } + AffineExpr getResult(unsigned idx) const { return results[idx]; } + void setResult(unsigned idx, AffineExpr result) { results[idx] = result; } + unsigned getNumResults() const { return results.size(); } + unsigned getNumDims() const { return numDims; } + void setNumDims(unsigned d) { numDims = d; } + unsigned getNumSymbols() const { return numSymbols; } + void setNumSymbols(unsigned d) { numSymbols = d; } + MLIRContext *getContext() const { return context; } + + /// Returns true if the idx'th result expression is a multiple of factor. + bool isMultipleOf(unsigned idx, int64_t factor) const; + + /// Resets this MutableAffineMap with 'map'. + void reset(AffineMap map); + + /// Simplify the (result) expressions in this map using analysis (used by + //-simplify-affine-expr pass). + void simplify(); + /// Get the AffineMap corresponding to this MutableAffineMap. Note that an + /// AffineMap will be uniqued and stored in context, while a mutable one + /// isn't. 
+ AffineMap getAffineMap() const; + +private: + // Same meaning as AffineMap's fields. + SmallVector results; + unsigned numDims; + unsigned numSymbols; + /// A pointer to the IR's context to store all newly created + /// AffineExprStorage's. + MLIRContext *context; +}; + +/// A mutable integer set. Its affine expressions are however unique. +struct MutableIntegerSet { +public: + MutableIntegerSet(IntegerSet set, MLIRContext *context); + + /// Create a universal set (no constraints). + MutableIntegerSet(unsigned numDims, unsigned numSymbols, + MLIRContext *context); + + unsigned getNumDims() const { return numDims; } + unsigned getNumSymbols() const { return numSymbols; } + unsigned getNumConstraints() const { return constraints.size(); } + + void clear() { + constraints.clear(); + eqFlags.clear(); + } + +private: + unsigned numDims; + unsigned numSymbols; + + SmallVector constraints; + SmallVector eqFlags; +}; + +/// An AffineValueMap is an affine map plus its ML value operands and +/// results for analysis purposes. The structure is still a tree form that is +/// same as that of an affine map or an AffineApplyOp. However, its operands, +/// results, and its map can themselves change as a result of +/// substitutions, simplifications, and other analysis. +// An affine value map can readily be constructed from an AffineApplyOp, or an +// AffineBound of a AffineForOp. It can be further transformed, substituted +// into, or simplified. Unlike AffineMap's, AffineValueMap's are created and +// destroyed during analysis. Only the AffineMap expressions that are pointed by +// them are unique'd. An affine value map, and the operations on it, maintain +// the invariant that operands are always positionally aligned with the +// AffineDimExpr and AffineSymbolExpr in the underlying AffineMap. +// TODO(bondhugula): Some of these classes could go into separate files. 
+class AffineValueMap { +public: + // Creates an empty AffineValueMap (users should call 'reset' to reset map + // and operands). + AffineValueMap() {} + AffineValueMap(AffineMap map, ArrayRef operands, + ArrayRef results = llvm::None); + + explicit AffineValueMap(AffineApplyOp applyOp); + explicit AffineValueMap(AffineBound bound); + + ~AffineValueMap(); + + // Resets this AffineValueMap with 'map', 'operands', and 'results'. + void reset(AffineMap map, ArrayRef operands, + ArrayRef results = llvm::None); + + /// Return the value map that is the difference of value maps 'a' and 'b', + /// represented as an affine map and its operands. The output map + operands + /// are canonicalized and simplified. + static void difference(const AffineValueMap &a, const AffineValueMap &b, + AffineValueMap *res); + + /// Return true if the idx^th result can be proved to be a multiple of + /// 'factor', false otherwise. + inline bool isMultipleOf(unsigned idx, int64_t factor) const; + + /// Return true if the idx^th result depends on 'value', false otherwise. + bool isFunctionOf(unsigned idx, Value value) const; + + /// Return true if the result at 'idx' is a constant, false + /// otherwise. + bool isConstant(unsigned idx) const; + + /// Return true if this is an identity map. + bool isIdentity() const; + + void setResult(unsigned i, AffineExpr e) { map.setResult(i, e); } + AffineExpr getResult(unsigned i) { return map.getResult(i); } + inline unsigned getNumOperands() const { return operands.size(); } + inline unsigned getNumDims() const { return map.getNumDims(); } + inline unsigned getNumSymbols() const { return map.getNumSymbols(); } + inline unsigned getNumResults() const { return map.getNumResults(); } + + Value getOperand(unsigned i) const; + ArrayRef getOperands() const; + AffineMap getAffineMap() const; + +private: + // A mutable affine map. + MutableAffineMap map; + + // TODO: make these trailing objects? + /// The SSA operands binding to the dim's and symbols of 'map'. 
+ SmallVector operands; + /// The SSA results binding to the results of 'map'. + SmallVector results; +}; + +/// An IntegerValueSet is an integer set plus its operands. +// Both, the integer set being pointed to and the operands can change during +// analysis, simplification, and transformation. +class IntegerValueSet { + /// Constructs an integer value set from an affine value map. + // This will lead to a single equality in 'set'. + explicit IntegerValueSet(const AffineValueMap &avm); + + /// Returns true if this integer set is determined to be empty. Emptiness is + /// checked by by eliminating identifiers successively (through either + /// Gaussian or Fourier-Motzkin) while using the GCD test and a trivial + /// invalid constraint check. Returns 'true' if the constraint system is found + /// to be empty; false otherwise. This method is exact for rational spaces but + /// not integer spaces - thus, if it returns true, the set is provably integer + /// empty as well, but if it returns false, it doesn't necessarily mean an + /// integer point exists in it. This method also returns false where an + /// explosion of constraints is detected - due to the super-exponential + /// worse-case complexity of Fourier-Motzkin elimination (rare for realistic + /// problem cases but possible for artificial adversarial or improperly + // constructed ones), this method returns false conservatively. + bool isEmpty() const; + + bool getNumDims() const { return set.getNumDims(); } + bool getNumSymbols() const { return set.getNumSymbols(); } + +private: + // The set pointed to may itself change unlike in IR structures like + // 'AffineCondition'. + MutableIntegerSet set; + /// The SSA operands binding to the dim's and symbols of 'set'. + SmallVector operands; +}; + +/// A flat list of affine equalities and inequalities in the form. +/// Inequality: c_0*x_0 + c_1*x_1 + .... + c_{n-1}*x_{n-1} >= 0 +/// Equality: c_0*x_0 + c_1*x_1 + .... 
+ c_{n-1}*x_{n-1} == 0 +/// +/// FlatAffineConstraints stores coefficients in a contiguous buffer (one buffer +/// for equalities and one for inequalities). The size of each buffer is +/// numReservedCols * number of inequalities (or equalities). The reserved size +/// is numReservedCols * numReservedInequalities (or numReservedEqualities). A +/// coefficient (r, c) lives at the location numReservedCols * r + c in the +/// buffer. The extra space between getNumCols() and numReservedCols exists to +/// prevent frequent movement of data when adding columns, especially at the +/// end. +/// +/// The identifiers x_0, x_1, ... appear in the order: dimensional identifiers, +/// symbolic identifiers, and local identifiers. The local identifiers +/// correspond to local/internal variables created when converting from +/// AffineExpr's containing mod's and div's; they are thus needed to increase +/// representational power. Each local identifier is always (by construction) a +/// floordiv of a pure add/mul affine function of dimensional, symbolic, and +/// other local identifiers, in a non-mutually recursive way. Hence, every local +/// identifier can ultimately always be recovered as an affine function of +/// dimensional and symbolic identifiers (involving floordiv's); note however +/// that some floordiv combinations are converted to mod's by AffineExpr +/// construction. +/// +class FlatAffineConstraints { +public: + enum IdKind { Dimension, Symbol, Local }; + + /// Constructs a constraint system reserving memory for the specified number + /// of constraints and identifiers.. 
+ FlatAffineConstraints(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims = 0, + unsigned numSymbols = 0, unsigned numLocals = 0, + ArrayRef> idArgs = {}) + : numReservedCols(numReservedCols), numDims(numDims), + numSymbols(numSymbols) { + assert(numReservedCols >= numDims + numSymbols + 1); + assert(idArgs.empty() || idArgs.size() == numDims + numSymbols + numLocals); + equalities.reserve(numReservedCols * numReservedEqualities); + inequalities.reserve(numReservedCols * numReservedInequalities); + numIds = numDims + numSymbols + numLocals; + ids.reserve(numReservedCols); + if (idArgs.empty()) + ids.resize(numIds, None); + else + ids.append(idArgs.begin(), idArgs.end()); + } + + /// Constructs a constraint system with the specified number of + /// dimensions and symbols. + FlatAffineConstraints(unsigned numDims = 0, unsigned numSymbols = 0, + unsigned numLocals = 0, + ArrayRef> idArgs = {}) + : numReservedCols(numDims + numSymbols + numLocals + 1), numDims(numDims), + numSymbols(numSymbols) { + assert(numReservedCols >= numDims + numSymbols + 1); + assert(idArgs.empty() || idArgs.size() == numDims + numSymbols + numLocals); + numIds = numDims + numSymbols + numLocals; + ids.reserve(numIds); + if (idArgs.empty()) + ids.resize(numIds, None); + else + ids.append(idArgs.begin(), idArgs.end()); + } + + explicit FlatAffineConstraints(const HyperRectangularSet &set); + + /// Create a flat affine constraint system from an AffineValueMap or a list of + /// these. The constructed system will only include equalities. + // TODO(bondhugula) + explicit FlatAffineConstraints(const AffineValueMap &avm); + explicit FlatAffineConstraints(ArrayRef avmRef); + + /// Creates an affine constraint system from an IntegerSet. + explicit FlatAffineConstraints(IntegerSet set); + + /// Create an affine constraint system from an IntegerValueSet. 
+ // TODO(bondhugula) + explicit FlatAffineConstraints(const IntegerValueSet &set); + + FlatAffineConstraints(const FlatAffineConstraints &other); + + FlatAffineConstraints(ArrayRef avmRef, + IntegerSet set); + + FlatAffineConstraints(const MutableAffineMap &map); + + ~FlatAffineConstraints() {} + + // Clears any existing data and reserves memory for the specified constraints. + void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, unsigned numSymbols, + unsigned numLocals = 0, ArrayRef idArgs = {}); + + void reset(unsigned numDims = 0, unsigned numSymbols = 0, + unsigned numLocals = 0, ArrayRef idArgs = {}); + + /// Appends constraints from 'other' into this. This is equivalent to an + /// intersection with no simplification of any sort attempted. + void append(const FlatAffineConstraints &other); + + // Checks for emptiness by performing variable elimination on all identifiers, + // running the GCD test on each equality constraint, and checking for invalid + // constraints. + // Returns true if the GCD test fails for any equality, or if any invalid + // constraints are discovered on any row. Returns false otherwise. + bool isEmpty() const; + + // Runs the GCD test on all equality constraints. Returns 'true' if this test + // fails on any equality. Returns 'false' otherwise. + // This test can be used to disprove the existence of a solution. If it + // returns true, no integer solution to the equality constraints can exist. + bool isEmptyByGCDTest() const; + + // Clones this object. + std::unique_ptr clone() const; + + /// Returns the value at the specified equality row and column. 
+ inline int64_t atEq(unsigned i, unsigned j) const { + return equalities[i * numReservedCols + j]; + } + inline int64_t &atEq(unsigned i, unsigned j) { + return equalities[i * numReservedCols + j]; + } + + inline int64_t atIneq(unsigned i, unsigned j) const { + return inequalities[i * numReservedCols + j]; + } + + inline int64_t &atIneq(unsigned i, unsigned j) { + return inequalities[i * numReservedCols + j]; + } + + /// Returns the number of columns in the constraint system. + inline unsigned getNumCols() const { return numIds + 1; } + + inline unsigned getNumEqualities() const { + assert(equalities.size() % numReservedCols == 0 && + "inconsistent equality buffer size"); + return equalities.size() / numReservedCols; + } + + inline unsigned getNumInequalities() const { + assert(inequalities.size() % numReservedCols == 0 && + "inconsistent inequality buffer size"); + return inequalities.size() / numReservedCols; + } + + inline unsigned getNumReservedEqualities() const { + return equalities.capacity() / numReservedCols; + } + + inline unsigned getNumReservedInequalities() const { + return inequalities.capacity() / numReservedCols; + } + + inline ArrayRef getEquality(unsigned idx) const { + return ArrayRef(&equalities[idx * numReservedCols], getNumCols()); + } + + inline ArrayRef getInequality(unsigned idx) const { + return ArrayRef(&inequalities[idx * numReservedCols], + getNumCols()); + } + + AffineExpr toAffineExpr(unsigned idx, MLIRContext *context); + + /// Adds constraints (lower and upper bounds) for the specified 'affine.for' + /// operation's Value using IR information stored in its bound maps. The + /// right identifier is first looked up using forOp's Value. Asserts if the + /// Value corresponding to the 'affine.for' operation isn't found in the + /// constraint system. Returns failure for the yet unimplemented/unsupported + /// cases. 
Any new identifiers that are found in the bound operands of the + /// 'affine.for' operation are added as trailing identifiers (either + /// dimensional or symbolic depending on whether the operand is a valid + /// symbol). + // TODO(bondhugula): add support for non-unit strides. + LogicalResult addAffineForOpDomain(AffineForOp forOp); + + /// Adds a lower or an upper bound for the identifier at the specified + /// position with constraints being drawn from the specified bound map and + /// operands. If `eq` is true, add a single equality equal to the bound map's + /// first result expr. + LogicalResult addLowerOrUpperBound(unsigned pos, AffineMap boundMap, + ArrayRef operands, bool eq, + bool lower = true); + + /// Computes the lower and upper bounds of the first 'num' dimensional + /// identifiers (starting at 'offset') as an affine map of the remaining + /// identifiers (dimensional and symbolic). This method is able to detect + /// identifiers as floordiv's and mod's of affine expressions of other + /// identifiers with respect to (positive) constants. Sets bound map to a + /// null AffineMap if such a bound can't be found (or yet unimplemented). + void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context, + SmallVectorImpl *lbMaps, + SmallVectorImpl *ubMaps); + + /// Adds slice lower bounds represented by lower bounds in 'lbMaps' and upper + /// bounds in 'ubMaps' to each identifier in the constraint system which has + /// a value in 'values'. Note that both lower/upper bounds share the same + /// operand list 'operands'. + /// This function assumes 'values.size' == 'lbMaps.size' == 'ubMaps.size'. + /// Note that both lower/upper bounds use operands from 'operands'. + LogicalResult addSliceBounds(ArrayRef values, + ArrayRef lbMaps, + ArrayRef ubMaps, + ArrayRef operands); + + // Adds an inequality (>= 0) from the coefficients specified in inEq. + void addInequality(ArrayRef inEq); + // Adds an equality from the coefficients specified in eq. 
+ void addEquality(ArrayRef eq); + + /// Adds a constant lower bound constraint for the specified identifier. + void addConstantLowerBound(unsigned pos, int64_t lb); + /// Adds a constant upper bound constraint for the specified identifier. + void addConstantUpperBound(unsigned pos, int64_t ub); + + /// Adds a new local identifier as the floordiv of an affine function of other + /// identifiers, the coefficients of which are provided in 'dividend' and with + /// respect to a positive constant 'divisor'. Two constraints are added to the + /// system to capture equivalence with the floordiv: + /// q = dividend floordiv c <=> c*q <= dividend <= c*q + c - 1. + void addLocalFloorDiv(ArrayRef dividend, int64_t divisor); + + /// Adds a constant lower bound constraint for the specified expression. + void addConstantLowerBound(ArrayRef expr, int64_t lb); + /// Adds a constant upper bound constraint for the specified expression. + void addConstantUpperBound(ArrayRef expr, int64_t ub); + + /// Sets the identifier at the specified position to a constant. + void setIdToConstant(unsigned pos, int64_t val); + + /// Sets the identifier corresponding to the specified Value id to a + /// constant. Asserts if the 'id' is not found. + void setIdToConstant(Value id, int64_t val); + + /// Looks up the position of the identifier with the specified Value. Returns + /// true if found (false otherwise). `pos' is set to the (column) position of + /// the identifier. + bool findId(Value id, unsigned *pos) const; + + /// Returns true if an identifier with the specified Value exists, false + /// otherwise. + bool containsId(Value id) const; + + // Add identifiers of the specified kind - specified positions are relative to + // the kind of identifier. The coefficient column corresponding to the added + // identifier is initialized to zero. 'id' is the Value corresponding to the + // identifier that can optionally be provided. 
+ void addDimId(unsigned pos, Value id = nullptr); + void addSymbolId(unsigned pos, Value id = nullptr); + void addLocalId(unsigned pos); + void addId(IdKind kind, unsigned pos, Value id = nullptr); + + /// Add the specified values as a dim or symbol id depending on its nature, if + /// it already doesn't exist in the system. `id' has to be either a terminal + /// symbol or a loop IV, i.e., it cannot be the result affine.apply of any + /// symbols or loop IVs. The identifier is added to the end of the existing + /// dims or symbols. Additional information on the identifier is extracted + /// from the IR and added to the constraint system. + void addInductionVarOrTerminalSymbol(Value id); + + /// Composes the affine value map with this FlatAffineConstrains, adding the + /// results of the map as dimensions at the front [0, vMap->getNumResults()) + /// and with the dimensions set to the equalities specified by the value map. + /// Returns failure if the composition fails (when vMap is a semi-affine map). + /// The vMap's operand Value's are used to look up the right positions in + /// the FlatAffineConstraints with which to associate. The dimensional and + /// symbolic operands of vMap should match 1:1 (in the same order) with those + /// of this constraint system, but the latter could have additional trailing + /// operands. + LogicalResult composeMap(const AffineValueMap *vMap); + + /// Composes an affine map whose dimensions match one to one to the + /// dimensions of this FlatAffineConstraints. The results of the map 'other' + /// are added as the leading dimensions of this constraint system. Returns + /// failure if 'other' is a semi-affine map. + LogicalResult composeMatchingMap(AffineMap other); + + /// Projects out (aka eliminates) 'num' identifiers starting at position + /// 'pos'. The resulting constraint system is the shadow along the dimensions + /// that still exist. This method may not always be integer exact. 
+ // TODO(bondhugula): deal with integer exactness when necessary - can return a + // value to mark exactness for example. + void projectOut(unsigned pos, unsigned num); + inline void projectOut(unsigned pos) { return projectOut(pos, 1); } + + /// Projects out the identifier that is associate with Value . + void projectOut(Value id); + + void removeId(IdKind idKind, unsigned pos); + void removeId(unsigned pos); + + void removeDim(unsigned pos); + + void removeEquality(unsigned pos); + void removeInequality(unsigned pos); + + /// Changes the partition between dimensions and symbols. Depending on the new + /// symbol count, either a chunk of trailing dimensional identifiers becomes + /// symbols, or some of the leading symbols become dimensions. + void setDimSymbolSeparation(unsigned newSymbolCount); + + /// Changes all symbol identifiers which are loop IVs to dim identifiers. + void convertLoopIVSymbolsToDims(); + + /// Sets the specified identifier to a constant and removes it. + void setAndEliminate(unsigned pos, int64_t constVal); + + /// Tries to fold the specified identifier to a constant using a trivial + /// equality detection; if successful, the constant is substituted for the + /// identifier everywhere in the constraint system and then removed from the + /// system. + LogicalResult constantFoldId(unsigned pos); + + /// This method calls constantFoldId for the specified range of identifiers, + /// 'num' identifiers starting at position 'pos'. + void constantFoldIdRange(unsigned pos, unsigned num); + + /// Updates the constraints to be the smallest bounding (enclosing) box that + /// contains the points of 'this' set and that of 'other', with the symbols + /// being treated specially. For each of the dimensions, the min of the lower + /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed + /// to determine such a bounding box. 
`other' is expected to have the same + /// dimensional identifiers as this constraint system (in the same order). + /// + /// Eg: if 'this' is {0 <= d0 <= 127}, 'other' is {16 <= d0 <= 192}, the + /// output is {0 <= d0 <= 192}. + /// 2) 'this' = {s0 + 5 <= d0 <= s0 + 20}, 'other' is {s0 + 1 <= d0 <= s0 + + /// 9}, output = {s0 + 1 <= d0 <= s0 + 20}. + /// 3) 'this' = {0 <= d0 <= 5, 1 <= d1 <= 9}, 'other' = {2 <= d0 <= 6, 5 <= d1 + /// <= 15}, output = {0 <= d0 <= 6, 1 <= d1 <= 15}. + LogicalResult unionBoundingBox(const FlatAffineConstraints &other); + + /// Returns 'true' if this constraint system and 'other' are in the same + /// space, i.e., if they are associated with the same set of identifiers, + /// appearing in the same order. Returns 'false' otherwise. + bool areIdsAlignedWithOther(const FlatAffineConstraints &other); + + /// Merge and align the identifiers of 'this' and 'other' starting at + /// 'offset', so that both constraint systems get the union of the contained + /// identifiers that is dimension-wise and symbol-wise unique; both + /// constraint systems are updated so that they have the union of all + /// identifiers, with this's original identifiers appearing first followed by + /// any of other's identifiers that didn't appear in 'this'. Local + /// identifiers of each system are by design separate/local and are placed + /// one after other (this's followed by other's). 
+ // Eg: Input: 'this' has ((%i %j) [%M %N]) + // 'other' has (%k, %j) [%P, %N, %M]) + // Output: both 'this', 'other' have (%i, %j, %k) [%M, %N, %P] + // + void mergeAndAlignIdsWithOther(unsigned offset, FlatAffineConstraints *other); + + unsigned getNumConstraints() const { + return getNumInequalities() + getNumEqualities(); + } + inline unsigned getNumIds() const { return numIds; } + inline unsigned getNumDimIds() const { return numDims; } + inline unsigned getNumSymbolIds() const { return numSymbols; } + inline unsigned getNumDimAndSymbolIds() const { return numDims + numSymbols; } + inline unsigned getNumLocalIds() const { + return numIds - numDims - numSymbols; + } + + inline ArrayRef> getIds() const { + return {ids.data(), ids.size()}; + } + inline MutableArrayRef> getIds() { + return {ids.data(), ids.size()}; + } + + /// Returns the optional Value corresponding to the pos^th identifier. + inline Optional getId(unsigned pos) const { return ids[pos]; } + inline Optional &getId(unsigned pos) { return ids[pos]; } + + /// Returns the Value associated with the pos^th identifier. Asserts if + /// no Value identifier was associated. + inline Value getIdValue(unsigned pos) const { + assert(ids[pos].hasValue() && "identifier's Value not set"); + return ids[pos].getValue(); + } + + /// Returns the Values associated with identifiers in range [start, end). + /// Asserts if no Value was associated with one of these identifiers. + void getIdValues(unsigned start, unsigned end, + SmallVectorImpl *values) const { + assert((start < numIds || start == end) && "invalid start position"); + assert(end <= numIds && "invalid end position"); + values->clear(); + values->reserve(end - start); + for (unsigned i = start; i < end; i++) { + values->push_back(getIdValue(i)); + } + } + inline void getAllIdValues(SmallVectorImpl *values) const { + getIdValues(0, numIds, values); + } + + /// Sets Value associated with the pos^th identifier. 
+ inline void setIdValue(unsigned pos, Value val) { + assert(pos < numIds && "invalid id position"); + ids[pos] = val; + } + /// Sets Values associated with identifiers in the range [start, end). + void setIdValues(unsigned start, unsigned end, ArrayRef values) { + assert((start < numIds || end == start) && "invalid start position"); + assert(end <= numIds && "invalid end position"); + assert(values.size() == end - start); + for (unsigned i = start; i < end; ++i) + ids[i] = values[i - start]; + } + + /// Clears this list of constraints and copies other into it. + void clearAndCopyFrom(const FlatAffineConstraints &other); + + /// Returns the smallest known constant bound for the extent of the specified + /// identifier (pos^th), i.e., the smallest known constant that is greater + /// than or equal to 'exclusive upper bound' - 'lower bound' of the + /// identifier. Returns None if it's not a constant. This method employs + /// trivial (low complexity / cost) checks and detection. Symbolic identifiers + /// are treated specially, i.e., it looks for constant differences between + /// affine expressions involving only the symbolic identifiers. See comments + /// at function definition for examples. 'lb' and 'lbDivisor', if provided, + /// are used to express the lower bound associated with the constant + /// difference: 'lb' has the coefficients and lbDivisor, the divisor. For eg., + /// if the lower bound is [(s0 + s2 - 1) floordiv 32] for a system with three + /// symbolic identifiers, *lb = [1, 0, 1], lbDivisor = 32. + Optional + getConstantBoundOnDimSize(unsigned pos, + SmallVectorImpl *lb = nullptr, + int64_t *lbFloorDivisor = nullptr, + SmallVectorImpl *ub = nullptr) const; + + /// Returns the constant lower bound for the pos^th identifier if there is + /// one; None otherwise. + Optional getConstantLowerBound(unsigned pos) const; + + /// Returns the constant upper bound for the pos^th identifier if there is + /// one; None otherwise. 
+ Optional getConstantUpperBound(unsigned pos) const; + + /// Gets the lower and upper bound of the pos^th identifier treating + /// [0, offset) U [offset + num, symStartPos) as dimensions and + /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned + /// multi-dimensional maps in the pair represent the max and min of + /// potentially multiple affine expressions. The upper bound is exclusive. + /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in + /// the system. + std::pair + getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, + unsigned symStartPos, ArrayRef localExprs, + MLIRContext *context) const; + + /// Returns true if the set can be trivially detected as being + /// hyper-rectangular on the specified contiguous set of identifiers. + bool isHyperRectangular(unsigned pos, unsigned num) const; + + /// Removes duplicate constraints, trivially true constraints, and constraints + /// that can be detected as redundant as a result of differing only in their + /// constant term part. A constraint of the form >= 0 + /// is considered trivially true. This method is a linear time method on the + /// constraints, does a single scan, and updates in place. + void removeTrivialRedundancy(); + + /// A more expensive check to detect redundant inequalities thatn + /// removeTrivialRedundancy. + void removeRedundantInequalities(); + + // Removes all equalities and inequalities. + void clearConstraints(); + + void print(raw_ostream &os) const; + void dump() const; + +private: + /// Returns false if the fields corresponding to various identifier counts, or + /// equality/inequality buffer sizes aren't consistent; true otherwise. This + /// is meant to be used within an assert internally. + bool hasConsistentState() const; + + /// Checks all rows of equality/inequality constraints for trivial + /// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced + /// after elimination. 
Returns 'true' if an invalid constraint is found; + /// 'false'otherwise. + bool hasInvalidConstraint() const; + + /// Returns the constant lower bound bound if isLower is true, and the upper + /// bound if isLower is false. + template + Optional computeConstantLowerOrUpperBound(unsigned pos); + + // Eliminates a single identifier at 'position' from equality and inequality + // constraints. Returns 'success' if the identifier was eliminated, and + // 'failure' otherwise. + inline LogicalResult gaussianEliminateId(unsigned position) { + return success(gaussianEliminateIds(position, position + 1) == 1); + } + + // Eliminates identifiers from equality and inequality constraints + // in column range [posStart, posLimit). + // Returns the number of variables eliminated. + unsigned gaussianEliminateIds(unsigned posStart, unsigned posLimit); + + /// Eliminates identifier at the specified position using Fourier-Motzkin + /// variable elimination, but uses Gaussian elimination if there is an + /// equality involving that identifier. If the result of the elimination is + /// integer exact, *isResultIntegerExact is set to true. If 'darkShadow' is + /// set to true, a potential under approximation (subset) of the rational + /// shadow / exact integer shadow is computed. + // See implementation comments for more details. + void FourierMotzkinEliminate(unsigned pos, bool darkShadow = false, + bool *isResultIntegerExact = nullptr); + + /// Tightens inequalities given that we are dealing with integer spaces. This + /// is similar to the GCD test but applied to inequalities. The constant term + /// can be reduced to the preceding multiple of the GCD of the coefficients, + /// i.e., + /// 64*i - 100 >= 0 => 64*i - 128 >= 0 (since 'i' is an integer). This is a + /// fast method (linear in the number of coefficients). + void GCDTightenInequalities(); + + /// Normalized each constraints by the GCD of its coefficients. 
+ void normalizeConstraintsByGCD(); + + /// Removes identifiers in column range [idStart, idLimit), and copies any + /// remaining valid data into place, updates member variables, and resizes + /// arrays as needed. + void removeIdRange(unsigned idStart, unsigned idLimit); + + /// Coefficients of affine equalities (in == 0 form). + SmallVector equalities; + + /// Coefficients of affine inequalities (in >= 0 form). + SmallVector inequalities; + + /// Number of columns reserved. Actual ones in used are returned by + /// getNumCols(). + unsigned numReservedCols; + + /// Total number of identifiers. + unsigned numIds; + + /// Number of identifiers corresponding to real dimensions. + unsigned numDims; + + /// Number of identifiers corresponding to symbols (unknown but constant for + /// analysis). + unsigned numSymbols; + + /// Values corresponding to the (column) identifiers of this constraint + /// system appearing in the order the identifiers correspond to columns. + /// Temporary ones or those that aren't associated to any Value are set to + /// None. + SmallVector, 8> ids; + + /// A parameter that controls detection of an unrealistic number of + /// constraints. If the number of constraints is this many times the number of + /// variables, we consider such a system out of line with the intended use + /// case of FlatAffineConstraints. + // The rationale for 32 is that in the typical simplest of cases, an + // identifier is expected to have one lower bound and one upper bound + // constraint. With a level of tiling or a connection to another identifier + // through a div or mod, an extra pair of bounds gets added. As a limit, we + // don't expect an identifier to have more than 32 lower/upper/equality + // constraints. This is conservatively set low and can be raised if needed. + constexpr static unsigned kExplosionFactor = 32; +}; + +/// Simplify an affine expression by flattening and some amount of +/// simple analysis. 
This has complexity linear in the number of nodes in +/// 'expr'. Returns the simplified expression, which is the same as the input +/// expression if it can't be simplified. +AffineExpr simplifyAffineExpr(AffineExpr expr, unsigned numDims, + unsigned numSymbols); + +/// Flattens 'expr' into 'flattenedExpr', which contains the coefficients of the +/// dimensions, symbols, and additional variables that represent floor divisions +/// of dimensions, symbols, and in turn other floor divisions. Returns failure +/// if 'expr' could not be flattened (i.e., semi-affine is not yet handled). +/// 'cst' contains constraints that connect newly introduced local identifiers +/// to existing dimensional and symbolic identifiers. See documentation for +/// AffineExprFlattener on how mod's and div's are flattened. +LogicalResult getFlattenedAffineExpr(AffineExpr expr, unsigned numDims, + unsigned numSymbols, + SmallVectorImpl *flattenedExpr, + FlatAffineConstraints *cst = nullptr); + +/// Flattens the result expressions of the map to their corresponding flattened +/// forms and set in 'flattenedExprs'. Returns failure if any expression in the +/// map could not be flattened (i.e., semi-affine is not yet handled). 'cst' +/// contains constraints that connect newly introduced local identifiers to +/// existing dimensional and / symbolic identifiers. See documentation for +/// AffineExprFlattener on how mod's and div's are flattened. For all affine +/// expressions that share the same operands (like those of an affine map), this +/// method should be used instead of repeatedly calling getFlattenedAffineExpr +/// since local variables added to deal with div's and mod's will be reused +/// across expressions. 
+LogicalResult +getFlattenedAffineExprs(AffineMap map, + std::vector> *flattenedExprs, + FlatAffineConstraints *cst = nullptr); +LogicalResult +getFlattenedAffineExprs(IntegerSet set, + std::vector> *flattenedExprs, + FlatAffineConstraints *cst = nullptr); + +} // end namespace mlir. + +#endif // MLIR_ANALYSIS_AFFINE_STRUCTURES_H diff --git a/mlir/include/mlir/Analysis/CMakeLists.txt b/mlir/include/mlir/Analysis/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d9a7ed369799f04de873b23e532c4bf7fbdb74a --- /dev/null +++ b/mlir/include/mlir/Analysis/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS CallInterfaces.td) +mlir_tablegen(CallInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(CallInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(MLIRCallOpInterfacesIncGen) + +set(LLVM_TARGET_DEFINITIONS InferTypeOpInterface.td) +mlir_tablegen(InferTypeOpInterface.h.inc -gen-op-interface-decls) +mlir_tablegen(InferTypeOpInterface.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(MLIRTypeInferOpInterfaceIncGen) diff --git a/mlir/include/mlir/Analysis/CallGraph.h b/mlir/include/mlir/Analysis/CallGraph.h new file mode 100644 index 0000000000000000000000000000000000000000..8f954161921edb645abc78acc9d57632ad0f01d3 --- /dev/null +++ b/mlir/include/mlir/Analysis/CallGraph.h @@ -0,0 +1,253 @@ +//===- CallGraph.h - CallGraph analysis for MLIR ----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains an analysis for computing the multi-level callgraph from a +// given top-level operation. 
This nodes within this callgraph are defined by +// the `CallOpInterface` and `CallableOpInterface` operation interfaces defined +// in CallInterface.td. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_CALLGRAPH_H +#define MLIR_ANALYSIS_CALLGRAPH_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/SetVector.h" + +namespace mlir { +struct CallInterfaceCallable; +class Operation; +class Region; + +//===----------------------------------------------------------------------===// +// CallGraphNode +//===----------------------------------------------------------------------===// + +/// This class represents a single callable in the callgraph. Aside from the +/// external node, each node represents a callable node in the graph and +/// contains a valid corresponding Region. The external node is a virtual node +/// used to represent external edges into, and out of, the callgraph. +class CallGraphNode { +public: + /// This class represents a directed edge between two nodes in the callgraph. + class Edge { + enum class Kind { + // An 'Abstract' edge represents an opaque, non-operation, reference + // between this node and the target. Edges of this type are only valid + // from the external node, as there is no valid connection to an operation + // in the module. + Abstract, + + // A 'Call' edge represents a direct reference to the target node via a + // call-like operation within the callable region of this node. + Call, + + // A 'Child' edge is used when the region of target node is defined inside + // of the callable region of this node. This means that the region of this + // node is an ancestor of the region for the target node. As such, this + // edge cannot be used on the 'external' node. + Child, + }; + + public: + /// Returns if this edge represents an `Abstract` edge. 
+ bool isAbstract() const { return targetAndKind.getInt() == Kind::Abstract; } + + /// Returns if this edge represents a `Call` edge. + bool isCall() const { return targetAndKind.getInt() == Kind::Call; } + + /// Returns if this edge represents a `Child` edge. + bool isChild() const { return targetAndKind.getInt() == Kind::Child; } + + /// Returns the target node for this edge. + CallGraphNode *getTarget() const { return targetAndKind.getPointer(); } + + bool operator==(const Edge &edge) const { + return targetAndKind == edge.targetAndKind; + } + + private: + Edge(CallGraphNode *node, Kind kind) : targetAndKind(node, kind) {} + explicit Edge(llvm::PointerIntPair targetAndKind) + : targetAndKind(targetAndKind) {} + + /// The target node of this edge, as well as the edge kind. + llvm::PointerIntPair targetAndKind; + + // Provide access to the constructor and Kind. + friend class CallGraphNode; + }; + + /// Returns if this node is the external node. + bool isExternal() const; + + /// Returns the callable region this node represents. This can only be called + /// on non-external nodes. + Region *getCallableRegion() const; + + /// Adds an abstract reference edge to the given node. An abstract edge does + /// not come from any observable operations, so this is only valid on the + /// external node. + void addAbstractEdge(CallGraphNode *node); + + /// Add an outgoing call edge from this node. + void addCallEdge(CallGraphNode *node); + + /// Adds a reference edge to the given child node. + void addChildEdge(CallGraphNode *child); + + /// Iterator over the outgoing edges of this node. + using iterator = SmallVectorImpl::const_iterator; + iterator begin() const { return edges.begin(); } + iterator end() const { return edges.end(); } + + /// Returns true if this node has any child edges. + bool hasChildren() const; + +private: + /// DenseMap info for callgraph edges. 
+ struct EdgeKeyInfo { + using BaseInfo = + DenseMapInfo>; + + static Edge getEmptyKey() { return Edge(BaseInfo::getEmptyKey()); } + static Edge getTombstoneKey() { return Edge(BaseInfo::getTombstoneKey()); } + static unsigned getHashValue(const Edge &edge) { + return BaseInfo::getHashValue(edge.targetAndKind); + } + static bool isEqual(const Edge &lhs, const Edge &rhs) { return lhs == rhs; } + }; + + CallGraphNode(Region *callableRegion) : callableRegion(callableRegion) {} + + /// Add an edge to 'node' with the given kind. + void addEdge(CallGraphNode *node, Edge::Kind kind); + + /// The callable region defines the boundary of the call graph node. This is + /// the region referenced by 'call' operations. This is at a per-region + /// boundary as operations may define multiple callable regions. + Region *callableRegion; + + /// A set of out-going edges from this node to other nodes in the graph. + llvm::SetVector, + llvm::SmallDenseSet> + edges; + + // Provide access to private methods. + friend class CallGraph; +}; + +//===----------------------------------------------------------------------===// +// CallGraph +//===----------------------------------------------------------------------===// + +class CallGraph { + using NodeMapT = llvm::MapVector>; + + /// This class represents an iterator over the internal call graph nodes. This + /// class unwraps the map iterator to access the raw node. + class NodeIterator final + : public llvm::mapped_iterator< + NodeMapT::const_iterator, + CallGraphNode *(*)(const NodeMapT::value_type &)> { + static CallGraphNode *unwrap(const NodeMapT::value_type &value) { + return value.second.get(); + } + + public: + /// Initializes the result type iterator to the specified result iterator. 
+ NodeIterator(NodeMapT::const_iterator it) + : llvm::mapped_iterator< + NodeMapT::const_iterator, + CallGraphNode *(*)(const NodeMapT::value_type &)>(it, &unwrap) {} + }; + +public: + CallGraph(Operation *op); + + /// Get or add a call graph node for the given region. `parentNode` + /// corresponds to the direct node in the callgraph that contains the parent + /// operation of `region`, or nullptr if there is no parent node. + CallGraphNode *getOrAddNode(Region *region, CallGraphNode *parentNode); + + /// Lookup a call graph node for the given region, or nullptr if none is + /// registered. + CallGraphNode *lookupNode(Region *region) const; + + /// Return the callgraph node representing the indirect-external callee. + CallGraphNode *getExternalNode() const { + return const_cast(&externalNode); + } + + /// Resolve the callable for given callee to a node in the callgraph, or the + /// external node if a valid node was not resolved. 'from' provides an anchor + /// for symbol table lookups, and is only required if the callable is a symbol + /// reference. + CallGraphNode *resolveCallable(CallInterfaceCallable callable, + Operation *from = nullptr) const; + + /// An iterator over the nodes of the graph. + using iterator = NodeIterator; + iterator begin() const { return nodes.begin(); } + iterator end() const { return nodes.end(); } + + /// Dump the graph in a human readable format. + void dump() const; + void print(raw_ostream &os) const; + +private: + /// The set of nodes within the callgraph. + NodeMapT nodes; + + /// A special node used to indicate an external edges. + CallGraphNode externalNode; +}; + +} // end namespace mlir + +namespace llvm { +// Provide graph traits for traversing call graphs using standard graph +// traversals. 
+template <> struct GraphTraits { + using NodeRef = mlir::CallGraphNode *; + static NodeRef getEntryNode(NodeRef node) { return node; } + + static NodeRef unwrap(const mlir::CallGraphNode::Edge &edge) { + return edge.getTarget(); + } + + // ChildIteratorType/begin/end - Allow iteration over all nodes in the graph. + using ChildIteratorType = + mapped_iterator; + static ChildIteratorType child_begin(NodeRef node) { + return {node->begin(), &unwrap}; + } + static ChildIteratorType child_end(NodeRef node) { + return {node->end(), &unwrap}; + } +}; + +template <> +struct GraphTraits + : public GraphTraits { + /// The entry node into the graph is the external node. + static NodeRef getEntryNode(const mlir::CallGraph *cg) { + return cg->getExternalNode(); + } + + // nodes_iterator/begin/end - Allow iteration over all nodes in the graph + using nodes_iterator = mlir::CallGraph::iterator; + static nodes_iterator nodes_begin(mlir::CallGraph *cg) { return cg->begin(); } + static nodes_iterator nodes_end(mlir::CallGraph *cg) { return cg->end(); } +}; +} // end namespace llvm + +#endif // MLIR_ANALYSIS_CALLGRAPH_H diff --git a/mlir/include/mlir/Analysis/CallInterfaces.h b/mlir/include/mlir/Analysis/CallInterfaces.h new file mode 100644 index 0000000000000000000000000000000000000000..b5870bac1429178c4680848942fc6db622ce1663 --- /dev/null +++ b/mlir/include/mlir/Analysis/CallInterfaces.h @@ -0,0 +1,31 @@ +//===- CallInterfaces.h - Call Interfaces for MLIR --------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the definitions of the call interfaces defined in +// `CallInterfaces.td`. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_CALLINTERFACES_H +#define MLIR_ANALYSIS_CALLINTERFACES_H + +#include "mlir/IR/OpDefinition.h" +#include "llvm/ADT/PointerUnion.h" + +namespace mlir { + +/// A callable is either a symbol, or an SSA value, that is referenced by a +/// call-like operation. This represents the destination of the call. +struct CallInterfaceCallable : public PointerUnion { + using PointerUnion::PointerUnion; +}; + +#include "mlir/Analysis/CallInterfaces.h.inc" +} // end namespace mlir + +#endif // MLIR_ANALYSIS_CALLINTERFACES_H diff --git a/mlir/include/mlir/Analysis/CallInterfaces.td b/mlir/include/mlir/Analysis/CallInterfaces.td new file mode 100644 index 0000000000000000000000000000000000000000..3e5b599baf8faf283fcd5fa67a42fc9586d650aa --- /dev/null +++ b/mlir/include/mlir/Analysis/CallInterfaces.td @@ -0,0 +1,84 @@ +//===- CallInterfaces.td - Call Interfaces for ops -*- tablegen ---------*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of interfaces that can be used to define information +// related to call-like and callable operations. Each of which are defined along +// with the respective interface below. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CALLINTERFACES +#define MLIR_CALLINTERFACES + +include "mlir/IR/OpBase.td" + +// `CallInterfaceCallable`: This is a type used to represent a single callable +// region. A callable is either a symbol, or an SSA value, that is referenced by +// a call-like operation. This represents the destination of the call. + +/// Interface for call-like operations. 
+def CallOpInterface : OpInterface<"CallOpInterface"> { + let description = [{ + A call-like operation is one that transfers control from one sub-routine to + another. These operations may be traditional direct calls `call @foo`, or + indirect calls to other operations `call_indirect %foo`. An operation that + uses this interface, must *not* also provide the `CallableOpInterface`. + }]; + + let methods = [ + InterfaceMethod<[{ + Returns the callee of this call-like operation. A `callee` is either a + reference to a symbol, via SymbolRefAttr, or a reference to a defined + SSA value. + }], + "CallInterfaceCallable", "getCallableForCallee" + >, + InterfaceMethod<[{ + Returns the operands within this call that are used as arguments to the + callee. + }], + "Operation::operand_range", "getArgOperands" + >, + ]; +} + +/// Interface for callable operations. +def CallableOpInterface : OpInterface<"CallableOpInterface"> { + let description = [{ + A callable operation is one who represents a potential sub-routine, and may + be a target for a call-like operation (those providing the CallOpInterface + above). These operations may be traditional functional operation + `func @foo(...)`, as well as function producing operations + `%foo = dialect.create_function(...)`. These operations may produce multiple + callable regions, or subroutines. + }]; + + let methods = [ + InterfaceMethod<[{ + Returns a region on the current operation that the given callable refers + to. This may return null in the case of an external callable object, + e.g. an external function. + }], + "Region *", "getCallableRegion", (ins "CallInterfaceCallable":$callable) + >, + InterfaceMethod<[{ + Returns all of the callable regions of this operation. + }], + "void", "getCallableRegions", + (ins "SmallVectorImpl &":$callables) + >, + InterfaceMethod<[{ + Returns the results types that the given callable region produces when + executed. 
+ }], + "ArrayRef", "getCallableResults", (ins "Region *":$callable) + >, + ]; +} + +#endif // MLIR_CALLINTERFACES diff --git a/mlir/include/mlir/Analysis/Dominance.h b/mlir/include/mlir/Analysis/Dominance.h new file mode 100644 index 0000000000000000000000000000000000000000..ead54b93e8084592cb196b1980f1961dbf0b989e --- /dev/null +++ b/mlir/include/mlir/Analysis/Dominance.h @@ -0,0 +1,141 @@ +//===- Dominance.h - Dominator analysis for CFGs ----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_DOMINANCE_H +#define MLIR_ANALYSIS_DOMINANCE_H + +#include "mlir/IR/RegionGraphTraits.h" +#include "llvm/Support/GenericDomTree.h" + +extern template class llvm::DominatorTreeBase; +extern template class llvm::DominatorTreeBase; + +namespace mlir { +using DominanceInfoNode = llvm::DomTreeNodeBase; +class Operation; + +namespace detail { +template class DominanceInfoBase { + using base = llvm::DominatorTreeBase; + +public: + DominanceInfoBase(Operation *op) { recalculate(op); } + DominanceInfoBase(DominanceInfoBase &&) = default; + DominanceInfoBase &operator=(DominanceInfoBase &&) = default; + + DominanceInfoBase(const DominanceInfoBase &) = delete; + DominanceInfoBase &operator=(const DominanceInfoBase &) = delete; + + /// Recalculate the dominance info. + void recalculate(Operation *op); + + /// Get the root dominance node of the given region. + DominanceInfoNode *getRootNode(Region *region) { + assert(dominanceInfos.count(region) != 0); + return dominanceInfos[region]->getRootNode(); + } + +protected: + using super = DominanceInfoBase; + + /// Return true if the specified block A properly dominates block B. 
+ bool properlyDominates(Block *a, Block *b); + + /// A mapping of regions to their base dominator tree. + DenseMap> dominanceInfos; +}; +} // end namespace detail + +/// A class for computing basic dominance information. +class DominanceInfo : public detail::DominanceInfoBase { +public: + using super::super; + + /// Return true if operation A properly dominates operation B. + bool properlyDominates(Operation *a, Operation *b); + + /// Return true if operation A dominates operation B. + bool dominates(Operation *a, Operation *b) { + return a == b || properlyDominates(a, b); + } + + /// Return true if value A properly dominates operation B. + bool properlyDominates(Value a, Operation *b); + + /// Return true if operation A dominates operation B. + bool dominates(Value a, Operation *b) { + return (Operation *)a->getDefiningOp() == b || properlyDominates(a, b); + } + + /// Return true if the specified block A dominates block B. + bool dominates(Block *a, Block *b) { + return a == b || properlyDominates(a, b); + } + + /// Return true if the specified block A properly dominates block B. + bool properlyDominates(Block *a, Block *b) { + return super::properlyDominates(a, b); + } + + /// Return the dominance node from the Region containing block A. + DominanceInfoNode *getNode(Block *a); + + /// Update the internal DFS numbers for the dominance nodes. + void updateDFSNumbers(); +}; + +/// A class for computing basic postdominance information. +class PostDominanceInfo : public detail::DominanceInfoBase { +public: + using super::super; + + /// Return true if operation A properly postdominates operation B. + bool properlyPostDominates(Operation *a, Operation *b); + + /// Return true if operation A postdominates operation B. + bool postDominates(Operation *a, Operation *b) { + return a == b || properlyPostDominates(a, b); + } + + /// Return true if the specified block A properly postdominates block B. 
+ bool properlyPostDominates(Block *a, Block *b) { + return super::properlyDominates(a, b); + } + + /// Return true if the specified block A postdominates block B. + bool postDominates(Block *a, Block *b) { + return a == b || properlyPostDominates(a, b); + } +}; + +} // end namespace mlir + +namespace llvm { + +/// DominatorTree GraphTraits specialization so the DominatorTree can be +/// iterated by generic graph iterators. +template <> struct GraphTraits { + using ChildIteratorType = mlir::DominanceInfoNode::iterator; + using NodeRef = mlir::DominanceInfoNode *; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static inline ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static inline ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + +template <> struct GraphTraits { + using ChildIteratorType = mlir::DominanceInfoNode::const_iterator; + using NodeRef = const mlir::DominanceInfoNode *; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static inline ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static inline ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + +} // end namespace llvm +#endif diff --git a/mlir/include/mlir/Analysis/InferTypeOpInterface.h b/mlir/include/mlir/Analysis/InferTypeOpInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..baf16162a0be5563f2fddbbcabfda74f0812e055 --- /dev/null +++ b/mlir/include/mlir/Analysis/InferTypeOpInterface.h @@ -0,0 +1,44 @@ +//===- InferTypeOpInterface.h - Infer Type Interfaces -----------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the definitions of the infer op interfaces defined in +// `InferTypeOpInterface.td`. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_INFERTYPEOPINTERFACE_H_ +#define MLIR_ANALYSIS_INFERTYPEOPINTERFACE_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallVector.h" + +namespace mlir { + +#include "mlir/Analysis/InferTypeOpInterface.h.inc" + +namespace OpTrait { +template +class TypeOpInterfaceDefault + : public TraitBase { +public: + /// Returns whether two arrays are equal as strongest check for compatibility + /// by default. + static bool isCompatibleReturnTypes(ArrayRef lhs, ArrayRef rhs) { + return lhs == rhs; + }; +}; +} // namespace OpTrait + +} // namespace mlir + +#endif // MLIR_ANALYSIS_INFERTYPEOPINTERFACE_H_ diff --git a/mlir/include/mlir/Analysis/InferTypeOpInterface.td b/mlir/include/mlir/Analysis/InferTypeOpInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..bbcea6be7eb3fd86256e098a8b77e308c0787dcb --- /dev/null +++ b/mlir/include/mlir/Analysis/InferTypeOpInterface.td @@ -0,0 +1,65 @@ +//===- InferTypeOpInterface.td - Infer Type interfaces -----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of interfaces that can be used to define information +// related to type inference. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_INFERTYPEOPINTERFACE +#define MLIR_INFERTYPEOPINTERFACE + +include "mlir/IR/OpBase.td" + +// OpInterface to compute the return type of an operation. 
The arguments match
+// those in Operation::create with the exception that the location is optional
+// (if no location is provided, then the method will not emit an error on
+// mismatch).
+def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> {
+  let description = [{
+    Interface to access a registered method to infer the return types for an
+    operation that could be used during op construction, verification or
+    type inference.
+  }];
+
+  let methods = [
+    StaticInterfaceMethod<
+      /*desc=*/[{Infer the return types that an op would generate.
+
+      The method takes an optional location which, if set, will be used to
+      report errors on. The operands and attributes correspond to those with
+      which an Operation would be created (e.g., as used in Operation::create)
+      and the regions of the op.
+      }],
+      /*retTy=*/"LogicalResult",
+      /*methodName=*/"inferReturnTypes",
+      /*args=*/(ins "Optional<Location>":$location,
+                    "ValueRange":$operands,
+                    "ArrayRef<NamedAttribute>":$attributes,
+                    "RegionRange":$regions,
+                    "SmallVectorImpl<Type>&":$inferedReturnTypes)
+    >,
+    StaticInterfaceMethod<
+      /*desc=*/"Returns whether two array of types are compatible result types"
+               " for an op.",
+      /*retTy=*/"bool",
+      /*methodName=*/"isCompatibleReturnTypes",
+      /*args=*/(ins "ArrayRef<Type>":$lhs, "ArrayRef<Type>":$rhs),
+      /*methodBody=*/[{
+        return ConcreteOp::isCompatibleReturnTypes(lhs, rhs);
+      }],
+      /*defaultImplementation=*/[{
+        /// Returns whether two arrays are equal as strongest check for
+        /// compatibility by default.
+ return lhs == rhs; + }] + >, + ]; +} + +#endif // MLIR_INFERTYPEOPINTERFACE diff --git a/mlir/include/mlir/Analysis/Liveness.h b/mlir/include/mlir/Analysis/Liveness.h new file mode 100644 index 0000000000000000000000000000000000000000..7e1dc2903ae345bd037fd35a695ccfc1a752f12e --- /dev/null +++ b/mlir/include/mlir/Analysis/Liveness.h @@ -0,0 +1,148 @@ +//===- Liveness.h - Liveness analysis for MLIR ------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains an analysis for computing liveness information from a +// given top-level operation. The current version of the analysis uses a +// traditional algorithm to resolve detailed live-range information about all +// values within the specified regions. It is also possible to query liveness +// information on block level. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_LIVENESS_H +#define MLIR_ANALYSIS_LIVENESS_H + +#include + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" + +namespace mlir { + +class Block; +class LivenessBlockInfo; +class Operation; +class Region; +class Value; + +/// Represents an analysis for computing liveness information from a +/// given top-level operation. The analysis iterates over all associated +/// regions that are attached to the given top-level operation. It +/// computes liveness information for every value and block that are +/// included in the mentioned regions. It relies on a fixpoint iteration +/// to compute all live-in and live-out values of all included blocks. 
+/// Sample usage:
+///   Liveness liveness(topLevelOp);
+///   auto &allInValues = liveness.getLiveIn(block);
+///   auto &allOutValues = liveness.getLiveOut(block);
+///   auto allOperationsInWhichValueIsLive = liveness.resolveLiveness(value);
+///   bool lastUse = liveness.isLastUse(value, operation);
+class Liveness {
+public:
+  using OperationListT = std::vector<Operation *>;
+  using BlockMapT = DenseMap<Block *, LivenessBlockInfo>;
+  using ValueSetT = SmallPtrSet<Value, 16>;
+
+public:
+  /// Creates a new Liveness analysis that computes liveness
+  /// information for all associated regions.
+  Liveness(Operation *op);
+
+  /// Returns the operation this analysis was constructed from.
+  Operation *getOperation() const { return operation; }
+
+  /// Gets liveness info (if any) for the given value.
+  /// This includes all operations in which the given value is live.
+  /// Note that the operations in this list are not ordered and the current
+  /// implementation is computationally expensive (as it iterates over all
+  /// blocks in which the given value is live).
+  OperationListT resolveLiveness(Value value) const;
+
+  /// Gets liveness info (if any) for the block.
+  const LivenessBlockInfo *getLiveness(Block *block) const;
+
+  /// Returns a reference to a set containing live-in values (unordered).
+  const ValueSetT &getLiveIn(Block *block) const;
+
+  /// Returns a reference to a set containing live-out values (unordered).
+  const ValueSetT &getLiveOut(Block *block) const;
+
+  /// Returns true if the given operation represent the last use of the
+  /// given value.
+  bool isLastUse(Value value, Operation *operation) const;
+
+  /// Dumps the liveness information in a human readable format.
+  void dump() const;
+
+  /// Dumps the liveness information to the given stream.
+  void print(raw_ostream &os) const;
+
+private:
+  /// Initializes the internal mappings.
+  void build(MutableArrayRef<Region> regions);
+
+private:
+  /// The operation this analysis was constructed from.
+ Operation *operation; + + /// Maps blocks to internal liveness information. + BlockMapT blockMapping; +}; + +/// This class represents liveness information on block level. +class LivenessBlockInfo { +public: + /// A typedef declaration of a value set. + using ValueSetT = Liveness::ValueSetT; + +public: + /// Returns the underlying block. + Block *getBlock() const { return block; } + + /// Returns all values that are live at the beginning + /// of the block (unordered). + const ValueSetT &in() const { return inValues; } + + /// Returns all values that are live at the end + /// of the block (unordered). + const ValueSetT &out() const { return outValues; } + + /// Returns true if the given value is in the live-in set. + bool isLiveIn(Value value) const; + + /// Returns true if the given value is in the live-out set. + bool isLiveOut(Value value) const; + + /// Gets the start operation for the given value. This is the first operation + /// the given value is considered to be live. This could either be the start + /// operation of the current block (in case the value is live-in) or the + /// operation that defines the given value (must be referenced in this block). + Operation *getStartOperation(Value value) const; + + /// Gets the end operation for the given value using the start operation + /// provided (must be referenced in this block). + Operation *getEndOperation(Value value, Operation *startOperation) const; + +private: + /// The underlying block. + Block *block; + + /// The set of all live in values. + ValueSetT inValues; + + /// The set of all live out values. 
+ ValueSetT outValues; + + friend class Liveness; +}; + +} // end namespace mlir + +#endif // MLIR_ANALYSIS_LIVENESS_H diff --git a/mlir/include/mlir/Analysis/LoopAnalysis.h b/mlir/include/mlir/Analysis/LoopAnalysis.h new file mode 100644 index 0000000000000000000000000000000000000000..0dd89e454a83a1673f0bc1adcececce4f16b1950 --- /dev/null +++ b/mlir/include/mlir/Analysis/LoopAnalysis.h @@ -0,0 +1,88 @@ +//===- LoopAnalysis.h - loop analysis methods -------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for methods to analyze loops. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_LOOP_ANALYSIS_H +#define MLIR_ANALYSIS_LOOP_ANALYSIS_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" + +namespace mlir { + +class AffineExpr; +class AffineForOp; +class AffineMap; +class MemRefType; +class NestedPattern; +class Operation; +class Value; + +/// Returns the trip count of the loop as an affine map with its corresponding +/// operands if the latter is expressible as an affine expression, and nullptr +/// otherwise. This method always succeeds as long as the lower bound is not a +/// multi-result map. The trip count expression is simplified before returning. 
+/// This method only utilizes map composition to construct lower and upper
+/// bounds before computing the trip count expressions
+// TODO(mlir-team): this should be moved into 'Transforms/' and be replaced by a
+// pure analysis method relying on FlatAffineConstraints
+void buildTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
+                                  SmallVectorImpl<Value> *operands);
+
+/// Returns the trip count of the loop if it's a constant, None otherwise. This
+/// uses affine expression analysis and is able to determine constant trip count
+/// in non-trivial cases.
+Optional<uint64_t> getConstantTripCount(AffineForOp forOp);
+
+/// Returns the greatest known integral divisor of the trip count. Affine
+/// expression analysis is used (indirectly through getTripCount), and
+/// this method is thus able to determine non-trivial divisors.
+uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
+
+/// Given an induction variable `iv` of type AffineForOp and `indices` of type
+/// IndexType, returns the set of `indices` that are independent of `iv`.
+///
+/// Prerequisites (inherited from `isAccessInvariant` above):
+///   1. `iv` and `indices` of the proper type;
+///   2. at most one affine.apply is reachable from each index in `indices`;
+///
+/// Emits a note if it encounters a chain of affine.apply and conservatively
+/// those cases.
+DenseSet<Value, DenseMapInfo<Value>>
+getInvariantAccesses(Value iv, ArrayRef<Value> indices);
+
+using VectorizableLoopFun = std::function<bool(AffineForOp)>;
+
+/// Checks whether the loop is structurally vectorizable; i.e.:
+///   1. no conditionals are nested under the loop;
+///   2. all nested load/stores are to scalar MemRefs.
+/// TODO(ntv): relax the no-conditionals restriction
+bool isVectorizableLoopBody(AffineForOp loop,
+                            NestedPattern &vectorTransferMatcher);
+
+/// Checks whether the loop is structurally vectorizable and that all the LoadOp
+/// and StoreOp matched have access indexing functions that are are either:
+///   1. invariant along the loop induction variable created by 'loop';
+///   2.
varying along at most one memory dimension. If such a unique dimension +/// is found, it is written into `memRefDim`. +bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim, + NestedPattern &vectorTransferMatcher); + +/// Checks where SSA dominance would be violated if a for op's body +/// operations are shifted by the specified shifts. This method checks if a +/// 'def' and all its uses have the same shift factor. +// TODO(mlir-team): extend this to check for memory-based dependence +// violation when we have the support. +bool isInstwiseShiftValid(AffineForOp forOp, ArrayRef shifts); +} // end namespace mlir + +#endif // MLIR_ANALYSIS_LOOP_ANALYSIS_H diff --git a/mlir/include/mlir/Analysis/NestedMatcher.h b/mlir/include/mlir/Analysis/NestedMatcher.h new file mode 100644 index 0000000000000000000000000000000000000000..2da64e88e14876f798acf9adc94d7d95df64ca05 --- /dev/null +++ b/mlir/include/mlir/Analysis/NestedMatcher.h @@ -0,0 +1,187 @@ +//===- NestedMacher.h - Nested matcher for Function -------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_MLFUNCTIONMATCHER_H_ +#define MLIR_ANALYSIS_MLFUNCTIONMATCHER_H_ + +#include "mlir/IR/Function.h" +#include "mlir/IR/Operation.h" +#include "llvm/Support/Allocator.h" + +namespace mlir { + +class NestedPattern; +class Operation; + +/// An NestedPattern captures nested patterns in the IR. +/// It is used in conjunction with a scoped NestedPatternContext which is an +/// llvm::BumpPtrAllocator that handles memory allocations efficiently and +/// avoids ownership issues. +/// +/// In order to use NestedPatterns, first create a scoped context. +/// When the context goes out of scope, everything is freed. 
+/// This design simplifies the API by avoiding references to the context and +/// makes it clear that references to matchers must not escape. +/// +/// Example: +/// { +/// NestedPatternContext context; +/// auto gemmLike = Doall(Doall(Red(LoadStores()))); +/// auto matches = gemmLike.match(f); +/// // do work on matches +/// } // everything is freed +/// +/// +/// Nested abstraction for matching results. +/// Provides access to the nested Operation* captured by a Matcher. +/// +/// A NestedMatch contains an Operation* and the children NestedMatch and is +/// thus cheap to copy. NestedMatch is stored in a scoped bumper allocator whose +/// lifetime is managed by an RAII NestedPatternContext. +class NestedMatch { +public: + static NestedMatch build(Operation *operation, + ArrayRef nestedMatches); + NestedMatch(const NestedMatch &) = default; + NestedMatch &operator=(const NestedMatch &) = default; + + explicit operator bool() { return matchedOperation != nullptr; } + + Operation *getMatchedOperation() { return matchedOperation; } + ArrayRef getMatchedChildren() { return matchedChildren; } + +private: + friend class NestedPattern; + friend class NestedPatternContext; + + /// Underlying global bump allocator managed by a NestedPatternContext. + static llvm::BumpPtrAllocator *&allocator(); + + NestedMatch() = default; + + /// Payload, holds a NestedMatch and all its children along this branch. + Operation *matchedOperation; + ArrayRef matchedChildren; +}; + +/// A NestedPattern is a nested operation walker that: +/// 1. recursively matches a substructure in the tree; +/// 2. uses a filter function to refine matches with extra semantic +/// constraints (passed via a lambda of type FilterFunctionType); +/// 3. TODO(ntv) optionally applies actions (lambda). +/// +/// Nested patterns are meant to capture imperfectly nested loops while matching +/// properties over the whole loop nest. 
For instance, in vectorization we are
+/// interested in capturing all the imperfectly nested loops of a certain type
+/// and such that all the load and stores have certain access patterns along the
+/// loops' induction variables). Such NestedMatches are first captured using the
+/// `match` function and are later processed to analyze properties and apply
+/// transformations in a non-greedy way.
+///
+/// The NestedMatches captured in the IR can grow large, especially after
+/// aggressive unrolling. As experience has shown, it is generally better to use
+/// a plain walk over operations to match flat patterns but the current
+/// implementation is competitive nonetheless.
+using FilterFunctionType = std::function<bool(Operation &)>;
+inline bool defaultFilterFunction(Operation &) { return true; }
+class NestedPattern {
+public:
+  NestedPattern(ArrayRef<NestedPattern> nested,
+                FilterFunctionType filter = defaultFilterFunction);
+  NestedPattern(const NestedPattern &) = default;
+  NestedPattern &operator=(const NestedPattern &) = default;
+
+  /// Returns all the top-level matches in `func`.
+  void match(FuncOp func, SmallVectorImpl<NestedMatch> *matches) {
+    func.walk([&](Operation *op) { matchOne(op, matches); });
+  }
+
+  /// Returns all the top-level matches in `op`.
+  void match(Operation *op, SmallVectorImpl<NestedMatch> *matches) {
+    op->walk([&](Operation *child) { matchOne(child, matches); });
+  }
+
+  /// Returns the depth of the pattern.
+  unsigned getDepth() const;
+
+private:
+  friend class NestedPatternContext;
+  friend class NestedMatch;
+  friend struct State;
+
+  /// Underlying global bump allocator managed by a NestedPatternContext.
+  static llvm::BumpPtrAllocator *&allocator();
+
+  /// Matches this pattern against a single `op` and fills matches with the
+  /// result.
+  void matchOne(Operation *op, SmallVectorImpl<NestedMatch> *matches);
+
+  /// Nested patterns to be matched.
+  ArrayRef<NestedPattern> nestedPatterns;
+
+  /// Extra filter function to apply to prune patterns as the IR is walked.
+ FilterFunctionType filter; + + /// skip is an implementation detail needed so that we can implement match + /// without switching on the type of the Operation. The idea is that a + /// NestedPattern first checks if it matches locally and then recursively + /// applies its nested matchers to its elem->nested. Since we want to rely on + /// the existing operation walking functionality rather than duplicate + /// it, we allow an off-by-one traversal to account for the fact that we + /// write: + /// + /// void match(Operation *elem) { + /// for (auto &c : getNestedPatterns()) { + /// NestedPattern childPattern(...); + /// ^~~~ Needs off-by-one skip. + /// + Operation *skip; +}; + +/// RAII structure to transparently manage the bump allocator for +/// NestedPattern and NestedMatch classes. This avoids passing a context to +/// all the API functions. +class NestedPatternContext { +public: + NestedPatternContext() { + assert(NestedMatch::allocator() == nullptr && + "Only a single NestedPatternContext is supported"); + assert(NestedPattern::allocator() == nullptr && + "Only a single NestedPatternContext is supported"); + NestedMatch::allocator() = &allocator; + NestedPattern::allocator() = &allocator; + } + ~NestedPatternContext() { + NestedMatch::allocator() = nullptr; + NestedPattern::allocator() = nullptr; + } + llvm::BumpPtrAllocator allocator; +}; + +namespace matcher { +// Syntactic sugar NestedPattern builder functions. 
+NestedPattern Op(FilterFunctionType filter = defaultFilterFunction);
+NestedPattern If(NestedPattern child);
+NestedPattern If(FilterFunctionType filter, NestedPattern child);
+NestedPattern If(ArrayRef<NestedPattern> nested = {});
+NestedPattern If(FilterFunctionType filter,
+                 ArrayRef<NestedPattern> nested = {});
+NestedPattern For(NestedPattern child);
+NestedPattern For(FilterFunctionType filter, NestedPattern child);
+NestedPattern For(ArrayRef<NestedPattern> nested = {});
+NestedPattern For(FilterFunctionType filter,
+                  ArrayRef<NestedPattern> nested = {});
+
+bool isParallelLoop(Operation &op);
+bool isReductionLoop(Operation &op);
+bool isLoadOrStore(Operation &op);
+
+} // end namespace matcher
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_MLFUNCTIONMATCHER_H_
diff --git a/mlir/include/mlir/Analysis/Passes.h b/mlir/include/mlir/Analysis/Passes.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bbc850e6c9b523e384972d9022e4cca0e26b0ad
--- /dev/null
+++ b/mlir/include/mlir/Analysis/Passes.h
@@ -0,0 +1,36 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes that expose pass constructors in the
+// analysis library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_PASSES_H
+#define MLIR_ANALYSIS_PASSES_H
+
+#include "mlir/Support/LLVM.h"
+#include <memory>
+
+namespace mlir {
+
+class FuncOp;
+template <typename T> class OpPassBase;
+
+/// Creates a pass to check memref accesses in a Function.
+std::unique_ptr<OpPassBase<FuncOp>> createMemRefBoundCheckPass();
+
+/// Creates a pass to check memref access dependences in a Function.
+std::unique_ptr<OpPassBase<FuncOp>> createTestMemRefDependenceCheckPass();
+
+/// Creates a pass to test parallelism detection; emits note for parallel loops.
+std::unique_ptr<OpPassBase<FuncOp>> createParallelismDetectionTestPass();
+
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_PASSES_H
diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7b6e9570142dd051b19f8aa815b9e76e62cf39c
--- /dev/null
+++ b/mlir/include/mlir/Analysis/SliceAnalysis.h
@@ -0,0 +1,206 @@
+//===- SliceAnalysis.h - Analysis for Transitive UseDef chains --*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_SLICEANALYSIS_H_
+#define MLIR_ANALYSIS_SLICEANALYSIS_H_
+
+#include <functional>
+#include <vector>
+
+#include "mlir/Support/LLVM.h"
+
+#include "llvm/ADT/SetVector.h"
+
+namespace mlir {
+
+class Operation;
+
+/// Type of the condition to limit the propagation of transitive use-defs.
+/// This can be used in particular to limit the propagation to a given Scope or
+/// to avoid passing through certain types of operation in a configurable
+/// manner.
+using TransitiveFilter = std::function<bool(Operation *)>;
+
+/// Fills `forwardSlice` with the computed forward slice (i.e. all
+/// the transitive uses of op), **without** including that operation.
+///
+/// This additionally takes a TransitiveFilter which acts as a frontier:
+/// when looking at uses transitively, a operation that does not pass the
+/// filter is never propagated through. This allows in particular to carve out
+/// the scope within a ForInst or the scope within an IfInst.
+/// +/// The implementation traverses the use chains in postorder traversal for +/// efficiency reasons: if a operation is already in `forwardSlice`, no +/// need to traverse its uses again. Since use-def chains form a DAG, this +/// terminates. +/// +/// Upon return to the root call, `forwardSlice` is filled with a +/// postorder list of uses (i.e. a reverse topological order). To get a proper +/// topological order, we just just reverse the order in `forwardSlice` before +/// returning. +/// +/// Example starting from node 0 +/// ============================ +/// +/// 0 +/// ___________|___________ +/// 1 2 3 4 +/// |_______| |______| +/// | | | +/// | 5 6 +/// |___|_____________| +/// | | +/// 7 8 +/// |_______________| +/// | +/// 9 +/// +/// Assuming all local orders match the numbering order: +/// 1. after getting back to the root getForwardSlice, `forwardSlice` may +/// contain: +/// {9, 7, 8, 5, 1, 2, 6, 3, 4} +/// 2. reversing the result of 1. gives: +/// {4, 3, 6, 2, 1, 5, 8, 7, 9} +/// +void getForwardSlice( + Operation *op, llvm::SetVector *forwardSlice, + TransitiveFilter filter = /* pass-through*/ + [](Operation *) { return true; }); + +/// Fills `backwardSlice` with the computed backward slice (i.e. +/// all the transitive defs of op), **without** including that operation. +/// +/// This additionally takes a TransitiveFilter which acts as a frontier: +/// when looking at defs transitively, a operation that does not pass the +/// filter is never propagated through. This allows in particular to carve out +/// the scope within a ForInst or the scope within an IfInst. +/// +/// The implementation traverses the def chains in postorder traversal for +/// efficiency reasons: if a operation is already in `backwardSlice`, no +/// need to traverse its definitions again. Since useuse-def chains form a DAG, +/// this terminates. +/// +/// Upon return to the root call, `backwardSlice` is filled with a +/// postorder list of defs. 
This happens to be a topological order, from the +/// point of view of the use-def chains. +/// +/// Example starting from node 8 +/// ============================ +/// +/// 1 2 3 4 +/// |_______| |______| +/// | | | +/// | 5 6 +/// |___|_____________| +/// | | +/// 7 8 +/// |_______________| +/// | +/// 9 +/// +/// Assuming all local orders match the numbering order: +/// {1, 2, 5, 3, 4, 6} +/// +void getBackwardSlice( + Operation *op, llvm::SetVector *backwardSlice, + TransitiveFilter filter = /* pass-through*/ + [](Operation *) { return true; }); + +/// Iteratively computes backward slices and forward slices until +/// a fixed point is reached. Returns an `llvm::SetVector` which +/// **includes** the original operation. +/// +/// This allows building a slice (i.e. multi-root DAG where everything +/// that is reachable from an Value in forward and backward direction is +/// contained in the slice). +/// This is the abstraction we need to materialize all the operations for +/// supervectorization without worrying about orderings and Value +/// replacements. +/// +/// Example starting from any node +/// ============================== +/// +/// 1 2 3 4 +/// |_______| |______| +/// | | | | +/// | 5 6___| +/// |___|_____________| | +/// | | | +/// 7 8 | +/// |_______________| | +/// | | +/// 9 10 +/// +/// Return the whole DAG in some topological order. +/// +/// The implementation works by just filling up a worklist with iterative +/// alternate calls to `getBackwardSlice` and `getForwardSlice`. +/// +/// The following section describes some additional implementation +/// considerations for a potentially more efficient implementation but they are +/// just an intuition without proof, we still use a worklist for now. +/// +/// Additional implementation considerations +/// ======================================== +/// Consider the defs-op-uses hourglass. 
+/// ____ +/// \ / defs (in some topological order) +/// \/ +/// op +/// /\ +/// / \ uses (in some topological order) +/// /____\ +/// +/// We want to iteratively apply `getSlice` to construct the whole +/// list of Operation that are reachable by (use|def)+ from op. +/// We want the resulting slice in topological order. +/// Ideally we would like the ordering to be maintained in-place to avoid +/// copying Operation at each step. Keeping this ordering by construction +/// seems very unclear, so we list invariants in the hope of seeing whether +/// useful properties pop up. +/// +/// In the following: +/// we use |= for set inclusion; +/// we use << for set topological ordering (i.e. each pair is ordered). +/// +/// Assumption: +/// =========== +/// We wish to maintain the following property by a recursive argument: +/// """ +/// defs << {op} < getSlice( + Operation *op, + TransitiveFilter backwardFilter = /* pass-through*/ + [](Operation *) { return true; }, + TransitiveFilter forwardFilter = /* pass-through*/ + [](Operation *) { return true; }); + +/// Multi-root DAG topological sort. +/// Performs a topological sort of the Operation in the `toSort` SetVector. +/// Returns a topologically sorted SetVector. +llvm::SetVector +topologicalSort(const llvm::SetVector &toSort); + +} // end namespace mlir + +#endif // MLIR_ANALYSIS_SLICEANALYSIS_H_ diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7cf1e5c971acc49c76150311754c75b7b28ebaee --- /dev/null +++ b/mlir/include/mlir/Analysis/Utils.h @@ -0,0 +1,295 @@ +//===- Utils.h - General analysis utilities ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for various transformation utilities for +// memref's and non-loop IR structures. These are not passes by themselves but +// are used either by passes, optimization sequences, or in turn by other +// transformation utilities. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_UTILS_H +#define MLIR_ANALYSIS_UTILS_H + +#include "mlir/Analysis/AffineStructures.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Location.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallVector.h" +#include + +namespace mlir { + +class AffineForOp; +class Block; +class FlatAffineConstraints; +class Location; +struct MemRefAccess; +class Operation; +class Value; + +/// Populates 'loops' with IVs of the loops surrounding 'op' ordered from +/// the outermost 'affine.for' operation to the innermost one. +// TODO(bondhugula): handle 'affine.if' ops. +void getLoopIVs(Operation &op, SmallVectorImpl *loops); + +/// Returns the nesting depth of this operation, i.e., the number of loops +/// surrounding this operation. +unsigned getNestingDepth(Operation &op); + +/// Returns in 'sequentialLoops' all sequential loops in loop nest rooted +/// at 'forOp'. +void getSequentialLoops(AffineForOp forOp, + llvm::SmallDenseSet *sequentialLoops); + +/// ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their +/// associated operands for a set of loops within a loop nest (typically the +/// set of loops surrounding a store operation). Loop bound AffineMaps which +/// are non-null represent slices of that loop's iteration space. +struct ComputationSliceState { + // List of sliced loop IVs (ordered from outermost to innermost). + // EX: 'ivs[i]' has lower bound 'lbs[i]' and upper bound 'ubs[i]'. 
+  SmallVector<Value, 4> ivs;
+  // List of lower bound AffineMaps.
+  SmallVector<AffineMap, 4> lbs;
+  // List of upper bound AffineMaps.
+  SmallVector<AffineMap, 4> ubs;
+  // List of lower bound operands (lbOperands[i] are used by 'lbs[i]').
+  std::vector<SmallVector<Value, 4>> lbOperands;
+  // List of upper bound operands (ubOperands[i] are used by 'ubs[i]').
+  std::vector<SmallVector<Value, 4>> ubOperands;
+  // Slice loop nest insertion point in target loop nest.
+  Block::iterator insertPoint;
+  // Adds to 'cst' with constraints which represent the slice bounds on 'ivs'
+  // in 'this'. Specifically, the values in 'ivs' are added to 'cst' as dim
+  // identifiers and the values in 'lb/ubOperands' are added as symbols.
+  // Constraints are added for all loop IV bounds (dim or symbol), and
+  // constraints are added for slice bounds in 'lbs'/'ubs'.
+  // Returns failure if we cannot add loop bounds because of unsupported cases.
+  LogicalResult getAsConstraints(FlatAffineConstraints *cst);
+
+  // Clears all bounds and operands in slice state.
+  void clearBounds();
+};
+
+/// Computes the computation slice loop bounds for one loop nest as affine maps
+/// of the other loop nest's IVs and symbols, using 'dependenceConstraints'
+/// computed between 'depSourceAccess' and 'depSinkAccess'.
+/// If 'isBackwardSlice' is true, a backwards slice is computed in which the
+/// slice bounds of loop nest surrounding 'depSourceAccess' are computed in
+/// terms of loop IVs and symbols of the loop nest surrounding 'depSinkAccess'
+/// at 'loopDepth'.
+/// If 'isBackwardSlice' is false, a forward slice is computed in which the
+/// slice bounds of loop nest surrounding 'depSinkAccess' are computed in terms
+/// of loop IVs and symbols of the loop nest surrounding 'depSourceAccess' at
+/// 'loopDepth'.
+/// The slice loop bounds and associated operands are returned in 'sliceState'.
+// +// Backward slice example: +// +// affine.for %i0 = 0 to 10 { +// affine.store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// affine.for %i1 = 0 to 10 { +// %v = affine.load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +// // Backward computation slice of loop nest '%i0'. +// affine.for %i0 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 1)(%i1) { +// affine.store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// +// Forward slice example: +// +// affine.for %i0 = 0 to 10 { +// affine.store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// affine.for %i1 = 0 to 10 { +// %v = affine.load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +// // Forward computation slice of loop nest '%i1'. +// affine.for %i1 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 1)(%i0) { +// %v = affine.load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +void getComputationSliceState(Operation *depSourceOp, Operation *depSinkOp, + FlatAffineConstraints *dependenceConstraints, + unsigned loopDepth, bool isBackwardSlice, + ComputationSliceState *sliceState); + +/// Computes in 'sliceUnion' the union of all slice bounds computed at +/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'. +/// The parameter 'numCommonLoops' is the number of loops common to the +/// operations in 'opsA' and 'opsB'. +/// If 'isBackwardSlice' is true, computes slice bounds for loop nest +/// surrounding ops in 'opsA', as a function of IVs and symbols of loop nest +/// surrounding ops in 'opsB' at 'loopDepth'. +/// If 'isBackwardSlice' is false, computes slice bounds for loop nest +/// surrounding ops in 'opsB', as a function of IVs and symbols of loop nest +/// surrounding ops in 'opsA' at 'loopDepth'. +/// Returns 'success' if union was computed, 'failure' otherwise. +// TODO(andydavis) Change this API to take 'forOpA'/'forOpB'. 
+LogicalResult computeSliceUnion(ArrayRef opsA, + ArrayRef opsB, unsigned loopDepth, + unsigned numCommonLoops, bool isBackwardSlice, + ComputationSliceState *sliceUnion); + +/// Creates a clone of the computation contained in the loop nest surrounding +/// 'srcOpInst', slices the iteration space of src loop based on slice bounds +/// in 'sliceState', and inserts the computation slice at the beginning of the +/// operation block of the loop at 'dstLoopDepth' in the loop nest surrounding +/// 'dstOpInst'. Returns the top-level loop of the computation slice on +/// success, returns nullptr otherwise. +// Loop depth is a crucial optimization choice that determines where to +// materialize the results of the backward slice - presenting a trade-off b/w +// storage and redundant computation in several cases. +// TODO(andydavis) Support computation slices with common surrounding loops. +AffineForOp insertBackwardComputationSlice(Operation *srcOpInst, + Operation *dstOpInst, + unsigned dstLoopDepth, + ComputationSliceState *sliceState); + +/// A region of a memref's data space; this is typically constructed by +/// analyzing load/store op's on this memref and the index space of loops +/// surrounding such op's. +// For example, the memref region for a load operation at loop depth = 1: +// +// affine.for %i = 0 to 32 { +// affine.for %ii = %i to (d0) -> (d0 + 8) (%i) { +// affine.load %A[%ii] +// } +// } +// +// Region: {memref = %A, write = false, {%i <= m0 <= %i + 7} } +// The last field is a 2-d FlatAffineConstraints symbolic in %i. +// +struct MemRefRegion { + explicit MemRefRegion(Location loc) : loc(loc) {} + + /// Computes the memory region accessed by this memref with the region + /// represented as constraints symbolic/parametric in 'loopDepth' loops + /// surrounding opInst. 
The computed region's 'cst' field has exactly as many + /// dimensional identifiers as the rank of the memref, and *potentially* + /// additional symbolic identifiers which could include any of the loop IVs + /// surrounding opInst up until 'loopDepth' and another additional Function + /// symbols involved with the access (for eg., those appear in affine.apply's, + /// loop bounds, etc.). If 'sliceState' is non-null, operands from + /// 'sliceState' are added as symbols, and the following constraints are added + /// to the system: + /// *) Inequality constraints which represent loop bounds for 'sliceState' + /// operands which are loop IVS (these represent the destination loop IVs + /// of the slice, and are added as symbols to MemRefRegion's constraint + /// system). + /// *) Inequality constraints for the slice bounds in 'sliceState', which + /// represent the bounds on the loop IVs in this constraint system w.r.t + /// to slice operands (which correspond to symbols). + /// If 'addMemRefDimBounds' is true, constant upper/lower bounds + /// [0, memref.getDimSize(i)) are added for each MemRef dimension 'i'. + /// + /// For example, the memref region for this operation at loopDepth = 1 will + /// be: + /// + /// affine.for %i = 0 to 32 { + /// affine.for %ii = %i to (d0) -> (d0 + 8) (%i) { + /// load %A[%ii] + /// } + /// } + /// + /// {memref = %A, write = false, {%i <= m0 <= %i + 7} } + /// The last field is a 2-d FlatAffineConstraints symbolic in %i. 
+ /// + LogicalResult compute(Operation *op, unsigned loopDepth, + ComputationSliceState *sliceState = nullptr, + bool addMemRefDimBounds = true); + + FlatAffineConstraints *getConstraints() { return &cst; } + const FlatAffineConstraints *getConstraints() const { return &cst; } + bool isWrite() const { return write; } + void setWrite(bool flag) { write = flag; } + + /// Returns a constant upper bound on the number of elements in this region if + /// bounded by a known constant (always possible for static shapes), None + /// otherwise. Note that the symbols of the region are treated specially, + /// i.e., the returned bounding constant holds for *any given* value of the + /// symbol identifiers. The 'shape' vector is set to the corresponding + /// dimension-wise bounds major to minor. We use int64_t instead of uint64_t + /// since index types can be at most int64_t. + Optional getConstantBoundingSizeAndShape( + SmallVectorImpl *shape = nullptr, + std::vector> *lbs = nullptr, + SmallVectorImpl *lbDivisors = nullptr) const; + + /// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos' + /// corresponds to the position of the memref shape's dimension (major to + /// minor) which matches 1:1 with the dimensional identifier positions in + //'cst'. + Optional + getConstantBoundOnDimSize(unsigned pos, + SmallVectorImpl *lb = nullptr, + int64_t *lbFloorDivisor = nullptr) const { + assert(pos < getRank() && "invalid position"); + return cst.getConstantBoundOnDimSize(pos, lb); + } + + /// Returns the size of this MemRefRegion in bytes. + Optional getRegionSize(); + + // Wrapper around FlatAffineConstraints::unionBoundingBox. + LogicalResult unionBoundingBox(const MemRefRegion &other); + + /// Returns the rank of the memref that this region corresponds to. + unsigned getRank() const; + + /// Memref that this region corresponds to. + Value memref; + + /// Read or write. 
+ bool write; + + /// If there is more than one load/store op associated with the region, the + /// location information would correspond to one of those op's. + Location loc; + + /// Region (data space) of the memref accessed. This set will thus have at + /// least as many dimensional identifiers as the shape dimensionality of the + /// memref, and these are the leading dimensions of the set appearing in that + /// order (major to minor / outermost to innermost). There may be additional + /// identifiers since getMemRefRegion() is called with a specific loop depth, + /// and thus the region is symbolic in the outer surrounding loops at that + /// depth. + // TODO(bondhugula): Replace this to exploit HyperRectangularSet. + FlatAffineConstraints cst; +}; + +/// Returns the size of memref data in bytes if it's statically shaped, None +/// otherwise. +Optional getMemRefSizeInBytes(MemRefType memRefType); + +/// Checks a load or store op for an out of bound access; returns failure if the +/// access is out of bounds along any of the dimensions, success otherwise. +/// Emits a diagnostic error (with location information) if emitError is true. +template +LogicalResult boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, + bool emitError = true); + +/// Returns the number of surrounding loops common to both A and B. +unsigned getNumCommonSurroundingLoops(Operation &A, Operation &B); + +/// Gets the memory footprint of all data touched in the specified memory space +/// in bytes; if the memory space is unspecified, considers all memory spaces. +Optional getMemoryFootprintBytes(AffineForOp forOp, + int memorySpace = -1); + +/// Returns true if `forOp' is a parallel loop. 
+bool isLoopParallel(AffineForOp forOp); + +} // end namespace mlir + +#endif // MLIR_ANALYSIS_UTILS_H diff --git a/mlir/include/mlir/Analysis/Verifier.h b/mlir/include/mlir/Analysis/Verifier.h new file mode 100644 index 0000000000000000000000000000000000000000..b7075b4f1578f1614dc6e0196bd0dab860f05fa9 --- /dev/null +++ b/mlir/include/mlir/Analysis/Verifier.h @@ -0,0 +1,22 @@ +//===- Verifier.h - Verifier analysis for MLIR structures -------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_VERIFIER_H +#define MLIR_ANALYSIS_VERIFIER_H + +namespace mlir { +struct LogicalResult; +class Operation; + +/// Perform (potentially expensive) checks of invariants, used to detect +/// compiler bugs, on this operation and any nested operations. On error, this +/// reports the error through the MLIRContext and returns failure. 
+LogicalResult verify(Operation *op); +} // end namespace mlir + +#endif diff --git a/mlir/include/mlir/CMakeLists.txt b/mlir/include/mlir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..84031a5e72a744ed8f58eab2d529f790105a7f88 --- /dev/null +++ b/mlir/include/mlir/CMakeLists.txt @@ -0,0 +1,4 @@ +add_subdirectory(Analysis) +add_subdirectory(Dialect) +add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/include/mlir/Conversion/AffineToStandard/AffineToStandard.h b/mlir/include/mlir/Conversion/AffineToStandard/AffineToStandard.h new file mode 100644 index 0000000000000000000000000000000000000000..c6a2fac6ec9fa9458821143375fd7b0d4dd3c972 --- /dev/null +++ b/mlir/include/mlir/Conversion/AffineToStandard/AffineToStandard.h @@ -0,0 +1,47 @@ +//===- AffineToStandard.h - Convert Affine to Standard dialect --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_AFFINETOSTANDARD_AFFINETOSTANDARD_H +#define MLIR_CONVERSION_AFFINETOSTANDARD_AFFINETOSTANDARD_H + +#include "mlir/Support/LLVM.h" + +namespace mlir { +class AffineExpr; +class AffineForOp; +class Location; +struct LogicalResult; +class MLIRContext; +class OpBuilder; +class RewritePattern; +class Value; + +// Owning list of rewriting patterns. +class OwningRewritePatternList; + +/// Emit code that computes the given affine expression using standard +/// arithmetic operations applied to the provided dimension and symbol values. 
+Value expandAffineExpr(OpBuilder &builder, Location loc, AffineExpr expr, + ArrayRef dimValues, ArrayRef symbolValues); + +/// Collect a set of patterns to convert from the Affine dialect to the Standard +/// dialect, in particular convert structured affine control flow into CFG +/// branch-based control flow. +void populateAffineToStdConversionPatterns(OwningRewritePatternList &patterns, + MLIRContext *ctx); + +/// Emit code that computes the lower bound of the given affine loop using +/// standard arithmetic operations. +Value lowerAffineLowerBound(AffineForOp op, OpBuilder &builder); + +/// Emit code that computes the upper bound of the given affine loop using +/// standard arithmetic operations. +Value lowerAffineUpperBound(AffineForOp op, OpBuilder &builder); +} // namespace mlir + +#endif // MLIR_CONVERSION_AFFINETOSTANDARD_AFFINETOSTANDARD_H diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h new file mode 100644 index 0000000000000000000000000000000000000000..4eb6379adf6e7b588dc902dc110722bf8016c120 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -0,0 +1,55 @@ +//===- GPUToCUDAPass.h - MLIR CUDA runtime support --------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ +#define MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ + +#include "mlir/Support/LLVM.h" +#include +#include +#include +#include + +namespace mlir { + +class Location; +class ModuleOp; + +namespace LLVM { +class LLVMDialect; +} // namespace LLVM + +template class OpPassBase; + +using OwnedCubin = std::unique_ptr>; +using CubinGenerator = + std::function; + +/// Creates a pass to convert kernel functions into CUBIN blobs. +/// +/// This transformation takes the body of each function that is annotated with +/// the 'nvvm.kernel' attribute, copies it to a new LLVM module, compiles the +/// module with help of the nvptx backend to PTX and then invokes the provided +/// cubinGenerator to produce a binary blob (the cubin). Such blob is then +/// attached as a string attribute named 'nvvm.cubin' to the kernel function. +/// After the transformation, the body of the kernel function is removed (i.e., +/// it is turned into a declaration). +std::unique_ptr> +createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); + +/// Creates a pass to convert a gpu.launch_func operation into a sequence of +/// CUDA calls. +/// +/// This pass does not generate code to call CUDA directly but instead uses a +/// small wrapper library that exports a stable and conveniently typed ABI +/// on top of CUDA. 
+std::unique_ptr> +createConvertGpuLaunchFuncToCudaCallsPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h new file mode 100644 index 0000000000000000000000000000000000000000..75e4f7e374c6ae00971ab6349fbb36ba31aa6032 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h @@ -0,0 +1,29 @@ +//===- GPUToNVVMPass.h - Convert GPU kernel to NVVM dialect -----*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_ +#define MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_ + +#include + +namespace mlir { +class LLVMTypeConverter; +class OwningRewritePatternList; + +class ModuleOp; +template class OpPassBase; + +/// Collect a set of patterns to convert from the GPU dialect to NVVM. +void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns); + +/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. +std::unique_ptr> createLowerGpuOpsToNVVMOpsPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h new file mode 100644 index 0000000000000000000000000000000000000000..e913c2e1131a1712552991fdfe63dea6f8cfe863 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -0,0 +1,23 @@ +//===- GPUToROCDLPass.h - Convert GPU kernel to ROCDL dialect ---*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUTOROCDL_GPUTOROCDLPASS_H_ +#define MLIR_CONVERSION_GPUTOROCDL_GPUTOROCDLPASS_H_ + +#include + +namespace mlir { + +class ModuleOp; +template class OpPassBase; + +/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts. +std::unique_ptr> createLowerGpuOpsToROCDLOpsPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTOROCDL_GPUTOROCDLPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h b/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h new file mode 100644 index 0000000000000000000000000000000000000000..762a6e502d4e8d00e339139f1b62005391725e82 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h @@ -0,0 +1,29 @@ +//===- ConvertGPUToSPIRV.h - GPU Ops to SPIR-V dialect patterns ----C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides patterns for lowering GPU Ops to SPIR-V dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRV_H +#define MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRV_H + +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +class SPIRVTypeConverter; +/// Appends to a pattern list additional patterns for translating GPU Ops to +/// SPIR-V ops. Needs the workgroup size as input since SPIR-V/Vulkan requires +/// the workgroup size to be statically specified. 
+void populateGPUToSPIRVPatterns(MLIRContext *context, + SPIRVTypeConverter &typeConverter, + OwningRewritePatternList &patterns, + ArrayRef workGroupSize); +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRV_H diff --git a/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h b/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h new file mode 100644 index 0000000000000000000000000000000000000000..37230f4c0e11f3f455471d3befab2e7afc3e4faf --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h @@ -0,0 +1,31 @@ +//===- ConvertGPUToSPIRVPass.h - GPU to SPIR-V conversion pass --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides a pass to convert GPU ops to SPIRV ops. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H +#define MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H + +#include "mlir/Support/LLVM.h" + +#include + +namespace mlir { + +class ModuleOp; +template class OpPassBase; + +/// Pass to convert GPU Ops to SPIR-V ops. Needs the workgroup size as input +/// since SPIR-V/Vulkan requires the workgroup size to be statically specified. 
+std::unique_ptr> +createConvertGPUToSPIRVPass(ArrayRef workGroupSize); + +} // namespace mlir +#endif // MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H diff --git a/mlir/include/mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h b/mlir/include/mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h new file mode 100644 index 0000000000000000000000000000000000000000..27950177c1d9cd096cc0eab3f45feccece73e184 --- /dev/null +++ b/mlir/include/mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h @@ -0,0 +1,30 @@ +//===- LinalgToLLVM.h - Utils to convert from the linalg dialect ----------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_LINALGTOLLVM_LINALGTOLLVM_H_ +#define MLIR_CONVERSION_LINALGTOLLVM_LINALGTOLLVM_H_ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +class MLIRContext; + +class LinalgTypeConverter : public LLVMTypeConverter { +public: + using LLVMTypeConverter::LLVMTypeConverter; + Type convertType(Type t) override; +}; + +/// Populate the given list with patterns that convert from Linalg to LLVM. 
+void populateLinalgToLLVMConversionPatterns(LinalgTypeConverter &converter, + OwningRewritePatternList &patterns, + MLIRContext *ctx); + +} // namespace mlir + +#endif // MLIR_CONVERSION_LINALGTOLLVM_LINALGTOLLVM_H_ diff --git a/mlir/include/mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h b/mlir/include/mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h new file mode 100644 index 0000000000000000000000000000000000000000..5cb8f59e6f7eb5026d1301a5cfe002e005fe4f00 --- /dev/null +++ b/mlir/include/mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h @@ -0,0 +1,35 @@ +//===- ConvertLoopToStandard.h - Pass entrypoint ----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_LOOPTOSTANDARD_CONVERTLOOPTOSTANDARD_H_ +#define MLIR_CONVERSION_LOOPTOSTANDARD_CONVERTLOOPTOSTANDARD_H_ + +#include +#include + +namespace mlir { +struct LogicalResult; +class MLIRContext; +class Pass; +class RewritePattern; + +// Owning list of rewriting patterns. +class OwningRewritePatternList; + +/// Collect a set of patterns to lower from loop.for, loop.if, and +/// loop.terminator to CFG operations within the Standard dialect, in particular +/// convert structured control flow into CFG branch-based control flow. +void populateLoopToStdConversionPatterns(OwningRewritePatternList &patterns, + MLIRContext *ctx); + +/// Creates a pass to convert loop.for, loop.if and loop.terminator ops to CFG. 
+std::unique_ptr createLowerToCFGPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_LOOPTOSTANDARD_CONVERTLOOPTOSTANDARD_H_ diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..80faa03f31332dc4afa7af35436407e789246f6d --- /dev/null +++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h @@ -0,0 +1,77 @@ +//===- LoopsToGPU.h - Convert loop nests to GPU kernels ---------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPU_H_ +#define MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPU_H_ + +#include "mlir/Support/LLVM.h" + +namespace mlir { +class AffineForOp; +struct LogicalResult; +class Value; + +namespace loop { +class ForOp; +} // end namespace loop + +/// Convert a perfect affine loop nest with the outermost loop identified by +/// `forOp` into a gpu::Launch operation. Map `numBlockDims` outer loops to +/// GPU blocks and `numThreadDims` to GPU threads. The bounds of the loops that +/// are mapped should be independent of the induction variables of the other +/// mapped loops. +/// +/// No check on the size of the block or grid, or on the validity of +/// parallelization is performed, it is under the responsibility of the caller +/// to strip-mine the loops and to perform the dependence analysis before +/// calling the conversion. +LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp, + unsigned numBlockDims, + unsigned numThreadDims); + +/// Convert a perfect linalg loop nest with the outermost loop identified by +/// `forOp` into a gpu::Launch operation. 
Map `numBlockDims` outer loops to +/// GPU blocks and `numThreadDims` to GPU threads. The bounds of the loops that +/// are mapped should be independent of the induction variables of the other +/// mapped loops. +/// +/// No check on the size of the block or grid, or on the validity of +/// parallelization is performed, it is under the responsibility of the caller +/// to strip-mine the loops and to perform the dependence analysis before +/// calling the conversion. +LogicalResult convertLoopNestToGPULaunch(loop::ForOp forOp, + unsigned numBlockDims, + unsigned numThreadDims); + +/// Convert a loop operation into a GPU launch with the values provided in +/// `numWorkGroups` as the grid size and the values provided in `workGroupSizes` +/// as the block size. Size of `numWorkGroups` and workGroupSizes` must be less +/// than or equal to 3. The loop operation can be an imperfectly nested +/// computation with the following restrictions: +/// 1) The loop nest must contain as many perfectly nested loops as the number +/// of values passed in through `numWorkGroups`. This corresponds to the number +/// of grid dimensions of the launch. All loops within the loop nest must be +/// parallel. +/// 2) The body of the innermost loop of the above perfectly nested loops, must +/// contain statements that satisfy one of the two conditions below: +/// a) A perfect loop nest of depth greater than or equal to the number of +/// values passed in through `workGroupSizes`, i.e. the number of thread +/// dimensions of the launch. Loops at depth less than or equal to size of +/// `workGroupSizes` must be parallel. Loops nested deeper can be sequential +/// and are retained as such in the generated GPU launch code. +/// b) Statements that are safe to be executed by all threads within the +/// workgroup. No checks are performed that this is indeed the case. +/// TODO(ravishankarm) : Add checks that verify 2(b) above. 
+/// The above conditions are assumed to be satisfied by the computation rooted +/// at `forOp`. +LogicalResult convertLoopToGPULaunch(loop::ForOp forOp, + ArrayRef numWorkGroups, + ArrayRef workGroupSizes); + +} // namespace mlir + +#endif // MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPU_H_ diff --git a/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h new file mode 100644 index 0000000000000000000000000000000000000000..a3d663ae3d75e29cbb721fb569a1abad8d331678 --- /dev/null +++ b/mlir/include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h @@ -0,0 +1,41 @@ +//===- LoopsToGPUPass.h - Pass converting loops to GPU kernels --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPUPASS_H_ +#define MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPUPASS_H_ + +#include "mlir/Support/LLVM.h" + +#include + +namespace mlir { +class FuncOp; +template class OpPassBase; + +/// Create a pass that converts loop nests into GPU kernels. It considers +/// top-level affine.for and linalg.for operations as roots of loop nests and +/// converts them to the gpu.launch operations if possible. +/// +/// No check on the size of the block or grid, or on the validity of +/// parallelization is performed, it is under the responsibility of the caller +/// to strip-mine the loops and to perform the dependence analysis before +/// calling the conversion. +std::unique_ptr> +createSimpleLoopsToGPUPass(unsigned numBlockDims, unsigned numThreadDims); + +/// Create a pass that converts every loop operation within the body of the +/// FuncOp into a GPU launch. 
The number of workgroups and workgroup size for +/// the implementation is controlled by SSA values passed into conversion +/// method. For testing, the values are set as constants obtained from a command +/// line flag. See convertLoopToGPULaunch for a description of the required +/// semantics of the converted loop operation. +std::unique_ptr> +createLoopToGPUPass(ArrayRef numWorkGroups, + ArrayRef workGroupSize); +} // namespace mlir + +#endif // MLIR_CONVERSION_LOOPSTOGPU_LOOPSTOGPUPASS_H_ diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h new file mode 100644 index 0000000000000000000000000000000000000000..e78859f992bac930647e9d9d832939861917b1ec --- /dev/null +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -0,0 +1,244 @@ +//===- ConvertStandardToLLVM.h - Convert to the LLVM dialect ----*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides a dialect conversion targeting the LLVM IR dialect. By default, it +// converts Standard ops and types and provides hooks for dialect-specific +// extensions to the conversion. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H +#define MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H + +#include "mlir/Transforms/DialectConversion.h" + +namespace llvm { +class IntegerType; +class LLVMContext; +class Module; +class Type; +} // namespace llvm + +namespace mlir { + +class UnrankedMemRefType; + +namespace LLVM { +class LLVMDialect; +class LLVMType; +} // namespace LLVM + +/// Conversion from types in the Standard dialect to the LLVM IR dialect. +class LLVMTypeConverter : public TypeConverter { +public: + using TypeConverter::convertType; + + LLVMTypeConverter(MLIRContext *ctx); + + /// Convert types to LLVM IR. This calls `convertAdditionalType` to convert + /// non-standard or non-builtin types. + Type convertType(Type t) override; + + /// Convert a function type. The arguments and results are converted one by + /// one and results are packed into a wrapped LLVM IR structure type. `result` + /// is populated with argument mapping. + LLVM::LLVMType convertFunctionSignature(FunctionType type, bool isVariadic, + SignatureConversion &result); + + /// Convert a non-empty list of types to be returned from a function into a + /// supported LLVM IR type. In particular, if more than one values is + /// returned, create an LLVM IR structure type with elements that correspond + /// to each of the MLIR types converted with `convertType`. + Type packFunctionResults(ArrayRef types); + + /// Returns the LLVM context. + llvm::LLVMContext &getLLVMContext(); + + /// Returns the LLVM dialect. + LLVM::LLVMDialect *getDialect() { return llvmDialect; } + + /// Promote the LLVM struct representation of all MemRef descriptors to stack + /// and use pointers to struct to avoid the complexity of the + /// platform-specific C/C++ ABI lowering related to struct argument passing. 
+ SmallVector promoteMemRefDescriptors(Location loc, + ValueRange opOperands, + ValueRange operands, + OpBuilder &builder); + + /// Promote the LLVM struct representation of one MemRef descriptor to stack + /// and use pointer to struct to avoid the complexity of the platform-specific + /// C/C++ ABI lowering related to struct argument passing. + Value promoteOneMemRefDescriptor(Location loc, Value operand, + OpBuilder &builder); + +protected: + /// LLVM IR module used to parse/create types. + llvm::Module *module; + LLVM::LLVMDialect *llvmDialect; + +private: + Type convertStandardType(Type type); + + // Convert a function type. The arguments and results are converted one by + // one. Additionally, if the function returns more than one value, pack the + // results into an LLVM IR structure type so that the converted function type + // returns at most one result. + Type convertFunctionType(FunctionType type); + + // Convert the index type. Uses llvmModule data layout to create an integer + // of the pointer bitwidth. + Type convertIndexType(IndexType type); + + // Convert an integer type `i*` to `!llvm<"i*">`. + Type convertIntegerType(IntegerType type); + + // Convert a floating point type: `f16` to `!llvm.half`, `f32` to + // `!llvm.float` and `f64` to `!llvm.double`. `bf16` is not supported + // by LLVM. + Type convertFloatType(FloatType type); + + // Convert a memref type into an LLVM type that captures the relevant data. + // For statically-shaped memrefs, the resulting type is a pointer to the + // (converted) memref element type. For dynamically-shaped memrefs, the + // resulting type is an LLVM structure type that contains: + // 1. a pointer to the (converted) memref element type + // 2. as many index types as memref has dynamic dimensions. 
+ Type convertMemRefType(MemRefType type); + + // Convert an unranked memref type to an LLVM type that captures the + // runtime rank and a pointer to the static ranked memref desc + Type convertUnrankedMemRefType(UnrankedMemRefType type); + + // Convert a 1D vector type into an LLVM vector type. + Type convertVectorType(VectorType type); + + // Get the LLVM representation of the index type based on the bitwidth of the + // pointer as defined by the data layout of the module. + LLVM::LLVMType getIndexType(); + + // Extract an LLVM IR dialect type. + LLVM::LLVMType unwrap(Type type); +}; + +/// Helper class to produce LLVM dialect operations extracting or inserting +/// values to a struct. +class StructBuilder { +public: + /// Construct a helper for the given value. + explicit StructBuilder(Value v); + /// Builds IR creating an `undef` value of the descriptor type. + static StructBuilder undef(OpBuilder &builder, Location loc, + Type descriptorType); + + /*implicit*/ operator Value() { return value; } + +protected: + // LLVM value + Value value; + // Cached struct type. + Type structType; + +protected: + /// Builds IR to extract a value from the struct at position pos + Value extractPtr(OpBuilder &builder, Location loc, unsigned pos); + /// Builds IR to set a value in the struct at position pos + void setPtr(OpBuilder &builder, Location loc, unsigned pos, Value ptr); +}; +/// Helper class to produce LLVM dialect operations extracting or inserting +/// elements of a MemRef descriptor. Wraps a Value pointing to the descriptor. +/// The Value may be null, in which case none of the operations are valid. +class MemRefDescriptor : public StructBuilder { +public: + /// Construct a helper for the given descriptor value. + explicit MemRefDescriptor(Value descriptor); + /// Builds IR creating an `undef` value of the descriptor type. 
+ static MemRefDescriptor undef(OpBuilder &builder, Location loc, + Type descriptorType); + /// Builds IR creating a MemRef descriptor that represents `type` and + /// populates it with static shape and stride information extracted from the + /// type. + static MemRefDescriptor fromStaticShape(OpBuilder &builder, Location loc, + LLVMTypeConverter &typeConverter, + MemRefType type, Value memory); + + /// Builds IR extracting the allocated pointer from the descriptor. + Value allocatedPtr(OpBuilder &builder, Location loc); + /// Builds IR inserting the allocated pointer into the descriptor. + void setAllocatedPtr(OpBuilder &builder, Location loc, Value ptr); + + /// Builds IR extracting the aligned pointer from the descriptor. + Value alignedPtr(OpBuilder &builder, Location loc); + + /// Builds IR inserting the aligned pointer into the descriptor. + void setAlignedPtr(OpBuilder &builder, Location loc, Value ptr); + + /// Builds IR extracting the offset from the descriptor. + Value offset(OpBuilder &builder, Location loc); + + /// Builds IR inserting the offset into the descriptor. + void setOffset(OpBuilder &builder, Location loc, Value offset); + void setConstantOffset(OpBuilder &builder, Location loc, uint64_t offset); + + /// Builds IR extracting the pos-th size from the descriptor. + Value size(OpBuilder &builder, Location loc, unsigned pos); + + /// Builds IR inserting the pos-th size into the descriptor + void setSize(OpBuilder &builder, Location loc, unsigned pos, Value size); + void setConstantSize(OpBuilder &builder, Location loc, unsigned pos, + uint64_t size); + + /// Builds IR extracting the pos-th size from the descriptor. 
+ Value stride(OpBuilder &builder, Location loc, unsigned pos); + + /// Builds IR inserting the pos-th stride into the descriptor + void setStride(OpBuilder &builder, Location loc, unsigned pos, Value stride); + void setConstantStride(OpBuilder &builder, Location loc, unsigned pos, + uint64_t stride); + + /// Returns the (LLVM) type this descriptor points to. + LLVM::LLVMType getElementType(); + +private: + // Cached index type. + Type indexType; +}; + +class UnrankedMemRefDescriptor : public StructBuilder { +public: + /// Construct a helper for the given descriptor value. + explicit UnrankedMemRefDescriptor(Value descriptor); + /// Builds IR creating an `undef` value of the descriptor type. + static UnrankedMemRefDescriptor undef(OpBuilder &builder, Location loc, + Type descriptorType); + + /// Builds IR extracting the rank from the descriptor + Value rank(OpBuilder &builder, Location loc); + /// Builds IR setting the rank in the descriptor + void setRank(OpBuilder &builder, Location loc, Value value); + /// Builds IR extracting ranked memref descriptor ptr + Value memRefDescPtr(OpBuilder &builder, Location loc); + /// Builds IR setting ranked memref descriptor ptr + void setMemRefDescPtr(OpBuilder &builder, Location loc, Value value); +}; +/// Base class for operation conversions targeting the LLVM IR dialect. Provides +/// conversion patterns with an access to the containing LLVMLowering for the +/// purpose of type conversions. +class LLVMOpLowering : public ConversionPattern { +public: + LLVMOpLowering(StringRef rootOpName, MLIRContext *context, + LLVMTypeConverter &lowering, PatternBenefit benefit = 1); + +protected: + // Back-reference to the lowering class, used to call type and function + // conversions accounting for potential extensions. 
+ LLVMTypeConverter &lowering; +}; + +} // namespace mlir + +#endif // MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h new file mode 100644 index 0000000000000000000000000000000000000000..a4d95da6a75d41a8d1a10285888b212071582777 --- /dev/null +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h @@ -0,0 +1,109 @@ +//===- ConvertStandardToLLVMPass.h - Pass entrypoint ------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVMPASS_H_ +#define MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVMPASS_H_ + +#include "llvm/ADT/STLExtras.h" +#include +#include + +namespace llvm { +class Module; +} // namespace llvm + +namespace mlir { +class DialectConversion; +class FuncOp; +class LLVMTypeConverter; +struct LogicalResult; +class MLIRContext; +class ModuleOp; +template class OpPassBase; +class RewritePattern; +class Type; + +// Owning list of rewriting patterns. +class OwningRewritePatternList; + +/// Type for a callback constructing the owning list of patterns for the +/// conversion to the LLVMIR dialect. The callback is expected to append +/// patterns to the owning list provided as the second argument. +using LLVMPatternListFiller = + std::function; + +/// Type for a callback constructing the type converter for the conversion to +/// the LLVMIR dialect. The callback is expected to return an instance of the +/// converter. 
+using LLVMTypeConverterMaker =
+    std::function(MLIRContext *)>;
+
+/// Collect a set of patterns to convert memory-related operations from the
+/// Standard dialect to the LLVM dialect.
+void populateStdToLLVMMemoryConversionPatters(
+    LLVMTypeConverter &converter, OwningRewritePatternList &patterns);
+
+/// Collect a set of patterns to convert from the Standard dialect to the LLVM
+/// dialect, excluding the memory-related operations.
+void populateStdToLLVMNonMemoryConversionPatterns(
+    LLVMTypeConverter &converter, OwningRewritePatternList &patterns);
+
+/// Collect a set of patterns to convert from the Standard dialect to LLVM.
+void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter,
+                                         OwningRewritePatternList &patterns);
+
+/// Creates a pass to convert the Standard dialect into the LLVMIR dialect.
+/// By default stdlib malloc/free are used for allocating MemRef payloads.
+/// Specifying `useAlloca=true` emits stack allocations instead. In the future
+/// this may become an enum when we have concrete uses for other options.
+std::unique_ptr>
+createLowerToLLVMPass(bool useAlloca = false);
+
+/// Creates a pass to convert operations to the LLVMIR dialect. The conversion
+/// is defined by a list of patterns and a type converter that will be obtained
+/// during the pass using the provided callbacks.
+/// By default stdlib malloc/free are used for allocating MemRef payloads.
+/// Specifying `useAlloca=true` emits stack allocations instead. In the future
+/// this may become an enum when we have concrete uses for other options.
+std::unique_ptr>
+createLowerToLLVMPass(LLVMPatternListFiller patternListFiller,
+                      LLVMTypeConverterMaker typeConverterMaker,
+                      bool useAlloca = false);
+
+/// Creates a pass to convert operations to the LLVMIR dialect. The conversion
+/// is defined by a list of patterns obtained during the pass using the provided
+/// callback and an optional type conversion class, an instance is created
+/// during the pass.
+/// By default stdlib malloc/free are used for allocating MemRef payloads.
+/// Specifying `useAlloca=true` emits stack allocations instead. In the future
+/// this may become an enum when we have concrete uses for other options.
+template
+std::unique_ptr>
+createLowerToLLVMPass(LLVMPatternListFiller patternListFiller,
+                      bool useAlloca = false) {
+  return createLowerToLLVMPass(
+      patternListFiller,
+      [](MLIRContext *context) {
+        return std::make_unique(context);
+      },
+      useAlloca);
+}
+
+namespace LLVM {
+/// Make argument-taking successors of each block distinct. PHI nodes in LLVM
+/// IR use the predecessor ID to identify which value to take. They do not
+/// support different values coming from the same predecessor. If a block has
+/// another block as a successor more than once with different values, insert
+/// a new dummy block for LLVM PHI nodes to tell the sources apart.
+void ensureDistinctSuccessors(ModuleOp m);
+} // namespace LLVM
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.h b/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0e874027bf443d4737a9d384c99e421a298d186
--- /dev/null
+++ b/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.h
@@ -0,0 +1,35 @@
+//===- ConvertStandardToSPIRV.h - Convert to SPIR-V dialect -----*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provides patterns to lower StandardOps to SPIR-V dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRV_H
+#define MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRV_H
+
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+class SPIRVTypeConverter;
+
+/// Appends to a pattern list additional patterns for translating StandardOps
+/// to SPIR-V ops. Also adds the patterns to legalize ops not directly
+/// translated to SPIR-V dialect.
+void populateStandardToSPIRVPatterns(MLIRContext *context,
+                                     SPIRVTypeConverter &typeConverter,
+                                     OwningRewritePatternList &patterns);
+
+/// Appends to a pattern list patterns to legalize ops that are not directly
+/// lowered to SPIR-V.
+void populateStdLegalizationPatternsForSPIRVLowering(
+    MLIRContext *context, OwningRewritePatternList &patterns);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRV_H
diff --git a/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h b/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h
new file mode 100644
index 0000000000000000000000000000000000000000..7dbaf1c04188d7c8ac5895cbe988ebfd182f54a7
--- /dev/null
+++ b/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h
@@ -0,0 +1,28 @@
+//===- ConvertStandardToSPIRVPass.h - StdOps to SPIR-V pass -----*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provides a pass to lower from StandardOps to SPIR-V dialect.
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRVPASS_H +#define MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRVPASS_H + +#include "mlir/Pass/Pass.h" + +namespace mlir { + +/// Pass to convert StandardOps to SPIR-V ops. +std::unique_ptr> createConvertStandardToSPIRVPass(); + +/// Pass to legalize ops that are not directly lowered to SPIR-V. +std::unique_ptr createLegalizeStdOpsForSPIRVLoweringPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_STANDARDTOSPIRV_CONVERTSTANDARDTOSPIRVPASS_H diff --git a/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h b/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h new file mode 100644 index 0000000000000000000000000000000000000000..b8b97c21a3efbf9ad398fddba199c3ad65671812 --- /dev/null +++ b/mlir/include/mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h @@ -0,0 +1,27 @@ +//===- ConvertVectorToLLVM.h - Utils to convert from the vector dialect ---===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLLVM_H_ +#define MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLLVM_H_ + +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +class LLVMTypeConverter; +class ModuleOp; +template class OpPassBase; + +/// Collect a set of patterns to convert from the Vector dialect to LLVM. +void populateVectorToLLVMConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns); + +/// Create a pass to convert vector operations to the LLVMIR dialect. 
+OpPassBase *createLowerVectorToLLVMPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLLVM_H_ diff --git a/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h b/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h new file mode 100644 index 0000000000000000000000000000000000000000..4f7d0843b7326211d61028afe2cf805ecaa00752 --- /dev/null +++ b/mlir/include/mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h @@ -0,0 +1,27 @@ +//===- ConvertVectorToLoops.h - Utils to convert from the vector dialect --===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ +#define MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ + +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +class MLIRContext; +class ModuleOp; +template class OpPassBase; + +/// Collect a set of patterns to convert from the Vector dialect to loops + std. +void populateVectorToAffineLoopsConversionPatterns( + MLIRContext *context, OwningRewritePatternList &patterns); + +/// Create a pass to convert vector operations to affine loops + std dialect. 
+OpPassBase *createLowerVectorToLoopsPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_VECTORTOLLVM_CONVERTVECTORTOLOOPS_H_ diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.h b/mlir/include/mlir/Dialect/AffineOps/AffineOps.h new file mode 100644 index 0000000000000000000000000000000000000000..b884ac5c2cea4f154e15e309bc719db9c4424a8e --- /dev/null +++ b/mlir/include/mlir/Dialect/AffineOps/AffineOps.h @@ -0,0 +1,677 @@ +//===- AffineOps.h - MLIR Affine Operations -------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines convenience types for working with Affine operations +// in the MLIR operation set. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H +#define MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H + +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Transforms/LoopLikeInterface.h" + +namespace mlir { +class AffineBound; +class AffineDimExpr; +class AffineValueMap; +class AffineTerminatorOp; +class FlatAffineConstraints; +class OpBuilder; + +/// A utility function to check if a value is defined at the top level of a +/// function. A value of index type defined at the top level is always a valid +/// symbol. +bool isTopLevelValue(Value value); + +class AffineOpsDialect : public Dialect { +public: + AffineOpsDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "affine"; } + + /// Materialize a single constant operation from a given attribute value with + /// the desired resultant type. 
+ Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; +}; + +/// The "affine.apply" operation applies an affine map to a list of operands, +/// yielding a single result. The operand list must be the same size as the +/// number of arguments to the affine mapping. All operands and the result are +/// of type 'Index'. This operation requires a single affine map attribute named +/// "map". For example: +/// +/// %y = "affine.apply" (%x) { map: (d0) -> (d0 + 1) } : +/// (index) -> (index) +/// +/// equivalently: +/// +/// #map42 = (d0)->(d0+1) +/// %y = affine.apply #map42(%x) +/// +class AffineApplyOp : public Op { +public: + using Op::Op; + + /// Builds an affine apply op with the specified map and operands. + static void build(Builder *builder, OperationState &result, AffineMap map, + ValueRange operands); + + /// Returns the affine map to be applied by this operation. + AffineMap getAffineMap() { + return getAttrOfType("map").getValue(); + } + + /// Returns true if the result of this operation can be used as dimension id. + bool isValidDim(); + + /// Returns true if the result of this operation is a symbol. + bool isValidSymbol(); + + static StringRef getOperationName() { return "affine.apply"; } + + operand_range getMapOperands() { return getOperands(); } + + // Hooks to customize behavior of this op. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + OpFoldResult fold(ArrayRef operands); + + static void getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context); +}; + +/// AffineDmaStartOp starts a non-blocking DMA operation that transfers data +/// from a source memref to a destination memref. The source and destination +/// memref need not be of the same dimensionality, but need to have the same +/// elemental type. 
The operands include the source and destination memref's +/// each followed by its indices, size of the data transfer in terms of the +/// number of elements (of the elemental type of the memref), a tag memref with +/// its indices, and optionally at the end, a stride and a +/// number_of_elements_per_stride arguments. The tag location is used by an +/// AffineDmaWaitOp to check for completion. The indices of the source memref, +/// destination memref, and the tag memref have the same restrictions as any +/// affine.load/store. In particular, index for each memref dimension must be an +/// affine expression of loop induction variables and symbols. +/// The optional stride arguments should be of 'index' type, and specify a +/// stride for the slower memory space (memory space with a lower memory space +/// id), transferring chunks of number_of_elements_per_stride every stride until +/// %num_elements are transferred. Either both or no stride arguments should be +/// specified. The value of 'num_elements' must be a multiple of +/// 'number_of_elements_per_stride'. +// +// For example, a DmaStartOp operation that transfers 256 elements of a memref +// '%src' in memory space 0 at indices [%i + 3, %j] to memref '%dst' in memory +// space 1 at indices [%k + 7, %l], would be specified as follows: +// +// %num_elements = constant 256 +// %idx = constant 0 : index +// %tag = alloc() : memref<1xi32, 4> +// affine.dma_start %src[%i + 3, %j], %dst[%k + 7, %l], %tag[%idx], +// %num_elements : +// memref<40x128xf32, 0>, memref<2x1024xf32, 1>, memref<1xi32, 2> +// +// If %stride and %num_elt_per_stride are specified, the DMA is expected to +// transfer %num_elt_per_stride elements every %stride elements apart from +// memory space 0 until %num_elements are transferred. +// +// affine.dma_start %src[%i, %j], %dst[%k, %l], %tag[%idx], %num_elements, +// %stride, %num_elt_per_stride : ... 
+// +// TODO(mlir-team): add additional operands to allow source and destination +// striding, and multiple stride levels (possibly using AffineMaps to specify +// multiple levels of striding). +// TODO(andydavis) Consider replacing src/dst memref indices with view memrefs. +class AffineDmaStartOp : public Op { +public: + using Op::Op; + + static void build(Builder *builder, OperationState &result, Value srcMemRef, + AffineMap srcMap, ValueRange srcIndices, Value destMemRef, + AffineMap dstMap, ValueRange destIndices, Value tagMemRef, + AffineMap tagMap, ValueRange tagIndices, Value numElements, + Value stride = nullptr, Value elementsPerStride = nullptr); + + /// Returns the operand index of the src memref. + unsigned getSrcMemRefOperandIndex() { return 0; } + + /// Returns the source MemRefType for this DMA operation. + Value getSrcMemRef() { return getOperand(getSrcMemRefOperandIndex()); } + MemRefType getSrcMemRefType() { + return getSrcMemRef()->getType().cast(); + } + + /// Returns the rank (number of indices) of the source MemRefType. + unsigned getSrcMemRefRank() { return getSrcMemRefType().getRank(); } + + /// Returns the affine map used to access the src memref. + AffineMap getSrcMap() { return getSrcMapAttr().getValue(); } + AffineMapAttr getSrcMapAttr() { + return getAttr(getSrcMapAttrName()).cast(); + } + + /// Returns the source memref affine map indices for this DMA operation. + operand_range getSrcIndices() { + return {operand_begin() + getSrcMemRefOperandIndex() + 1, + operand_begin() + getSrcMemRefOperandIndex() + 1 + + getSrcMap().getNumInputs()}; + } + + /// Returns the memory space of the src memref. + unsigned getSrcMemorySpace() { + return getSrcMemRef()->getType().cast().getMemorySpace(); + } + + /// Returns the operand index of the dst memref. + unsigned getDstMemRefOperandIndex() { + return getSrcMemRefOperandIndex() + 1 + getSrcMap().getNumInputs(); + } + + /// Returns the destination MemRefType for this DMA operations. 
+ Value getDstMemRef() { return getOperand(getDstMemRefOperandIndex()); } + MemRefType getDstMemRefType() { + return getDstMemRef()->getType().cast(); + } + + /// Returns the rank (number of indices) of the destination MemRefType. + unsigned getDstMemRefRank() { + return getDstMemRef()->getType().cast().getRank(); + } + + /// Returns the memory space of the src memref. + unsigned getDstMemorySpace() { + return getDstMemRef()->getType().cast().getMemorySpace(); + } + + /// Returns the affine map used to access the dst memref. + AffineMap getDstMap() { return getDstMapAttr().getValue(); } + AffineMapAttr getDstMapAttr() { + return getAttr(getDstMapAttrName()).cast(); + } + + /// Returns the destination memref indices for this DMA operation. + operand_range getDstIndices() { + return {operand_begin() + getDstMemRefOperandIndex() + 1, + operand_begin() + getDstMemRefOperandIndex() + 1 + + getDstMap().getNumInputs()}; + } + + /// Returns the operand index of the tag memref. + unsigned getTagMemRefOperandIndex() { + return getDstMemRefOperandIndex() + 1 + getDstMap().getNumInputs(); + } + + /// Returns the Tag MemRef for this DMA operation. + Value getTagMemRef() { return getOperand(getTagMemRefOperandIndex()); } + MemRefType getTagMemRefType() { + return getTagMemRef()->getType().cast(); + } + + /// Returns the rank (number of indices) of the tag MemRefType. + unsigned getTagMemRefRank() { + return getTagMemRef()->getType().cast().getRank(); + } + + /// Returns the affine map used to access the tag memref. + AffineMap getTagMap() { return getTagMapAttr().getValue(); } + AffineMapAttr getTagMapAttr() { + return getAttr(getTagMapAttrName()).cast(); + } + + /// Returns the tag memref indices for this DMA operation. + operand_range getTagIndices() { + return {operand_begin() + getTagMemRefOperandIndex() + 1, + operand_begin() + getTagMemRefOperandIndex() + 1 + + getTagMap().getNumInputs()}; + } + + /// Returns the number of elements being transferred by this DMA operation. 
+ Value getNumElements() { + return getOperand(getTagMemRefOperandIndex() + 1 + + getTagMap().getNumInputs()); + } + + /// Returns the AffineMapAttr associated with 'memref'. + NamedAttribute getAffineMapAttrForMemRef(Value memref) { + if (memref == getSrcMemRef()) + return {Identifier::get(getSrcMapAttrName(), getContext()), + getSrcMapAttr()}; + else if (memref == getDstMemRef()) + return {Identifier::get(getDstMapAttrName(), getContext()), + getDstMapAttr()}; + assert(memref == getTagMemRef() && + "DmaStartOp expected source, destination or tag memref"); + return {Identifier::get(getTagMapAttrName(), getContext()), + getTagMapAttr()}; + } + + /// Returns true if this is a DMA from a faster memory space to a slower one. + bool isDestMemorySpaceFaster() { + return (getSrcMemorySpace() < getDstMemorySpace()); + } + + /// Returns true if this is a DMA from a slower memory space to a faster one. + bool isSrcMemorySpaceFaster() { + // Assumes that a lower number is for a slower memory space. + return (getDstMemorySpace() < getSrcMemorySpace()); + } + + /// Given a DMA start operation, returns the operand position of either the + /// source or destination memref depending on the one that is at the higher + /// level of the memory hierarchy. Asserts failure if neither is true. + unsigned getFasterMemPos() { + assert(isSrcMemorySpaceFaster() || isDestMemorySpaceFaster()); + return isSrcMemorySpaceFaster() ? 0 : getDstMemRefOperandIndex(); + } + + static StringRef getSrcMapAttrName() { return "src_map"; } + static StringRef getDstMapAttrName() { return "dst_map"; } + static StringRef getTagMapAttrName() { return "tag_map"; } + + static StringRef getOperationName() { return "affine.dma_start"; } + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + LogicalResult fold(ArrayRef cstOperands, + SmallVectorImpl &results); + + /// Returns true if this DMA operation is strided, returns false otherwise. 
+ bool isStrided() { + return getNumOperands() != + getTagMemRefOperandIndex() + 1 + getTagMap().getNumInputs() + 1; + } + + /// Returns the stride value for this DMA operation. + Value getStride() { + if (!isStrided()) + return nullptr; + return getOperand(getNumOperands() - 1 - 1); + } + + /// Returns the number of elements to transfer per stride for this DMA op. + Value getNumElementsPerStride() { + if (!isStrided()) + return nullptr; + return getOperand(getNumOperands() - 1); + } +}; + +/// AffineDmaWaitOp blocks until the completion of a DMA operation associated +/// with the tag element '%tag[%index]'. %tag is a memref, and %index has to be +/// an index with the same restrictions as any load/store index. In particular, +/// index for each memref dimension must be an affine expression of loop +/// induction variables and symbols. %num_elements is the number of elements +/// associated with the DMA operation. For example: +// +// affine.dma_start %src[%i, %j], %dst[%k, %l], %tag[%index], %num_elements : +// memref<2048xf32, 0>, memref<256xf32, 1>, memref<1xi32, 2> +// ... +// ... +// affine.dma_wait %tag[%index], %num_elements : memref<1xi32, 2> +// +class AffineDmaWaitOp : public Op { +public: + using Op::Op; + + static void build(Builder *builder, OperationState &result, Value tagMemRef, + AffineMap tagMap, ValueRange tagIndices, Value numElements); + + static StringRef getOperationName() { return "affine.dma_wait"; } + + // Returns the Tag MemRef associated with the DMA operation being waited on. + Value getTagMemRef() { return getOperand(0); } + MemRefType getTagMemRefType() { + return getTagMemRef()->getType().cast(); + } + + /// Returns the affine map used to access the tag memref. + AffineMap getTagMap() { return getTagMapAttr().getValue(); } + AffineMapAttr getTagMapAttr() { + return getAttr(getTagMapAttrName()).cast(); + } + + // Returns the tag memref index for this DMA operation. 
+ operand_range getTagIndices() { + return {operand_begin() + 1, + operand_begin() + 1 + getTagMap().getNumInputs()}; + } + + // Returns the rank (number of indices) of the tag memref. + unsigned getTagMemRefRank() { + return getTagMemRef()->getType().cast().getRank(); + } + + /// Returns the AffineMapAttr associated with 'memref'. + NamedAttribute getAffineMapAttrForMemRef(Value memref) { + assert(memref == getTagMemRef()); + return {Identifier::get(getTagMapAttrName(), getContext()), + getTagMapAttr()}; + } + + /// Returns the number of elements transferred in the associated DMA op. + Value getNumElements() { return getOperand(1 + getTagMap().getNumInputs()); } + + static StringRef getTagMapAttrName() { return "tag_map"; } + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + LogicalResult fold(ArrayRef cstOperands, + SmallVectorImpl &results); +}; + +/// The "affine.load" op reads an element from a memref, where the index +/// for each memref dimension is an affine expression of loop induction +/// variables and symbols. The output of 'affine.load' is a new value with the +/// same type as the elements of the memref. An affine expression of loop IVs +/// and symbols must be specified for each dimension of the memref. The keyword +/// 'symbol' can be used to indicate SSA identifiers which are symbolic. +// +// Example 1: +// +// %1 = affine.load %0[%i0 + 3, %i1 + 7] : memref<100x100xf32> +// +// Example 2: Uses 'symbol' keyword for symbols '%n' and '%m'. +// +// %1 = affine.load %0[%i0 + symbol(%n), %i1 + symbol(%m)] +// : memref<100x100xf32> +// +class AffineLoadOp : public Op::Impl> { +public: + using Op::Op; + + /// Builds an affine load op with the specified map and operands. + static void build(Builder *builder, OperationState &result, AffineMap map, + ValueRange operands); + /// Builds an affine load op with an identity map and operands. 
+ static void build(Builder *builder, OperationState &result, Value memref, + ValueRange indices = {}); + /// Builds an affine load op with the specified map and its operands. + static void build(Builder *builder, OperationState &result, Value memref, + AffineMap map, ValueRange mapOperands); + + /// Returns the operand index of the memref. + unsigned getMemRefOperandIndex() { return 0; } + + /// Get memref operand. + Value getMemRef() { return getOperand(getMemRefOperandIndex()); } + void setMemRef(Value value) { setOperand(getMemRefOperandIndex(), value); } + MemRefType getMemRefType() { + return getMemRef()->getType().cast(); + } + + /// Get affine map operands. + operand_range getMapOperands() { return llvm::drop_begin(getOperands(), 1); } + + /// Returns the affine map used to index the memref for this operation. + AffineMap getAffineMap() { return getAffineMapAttr().getValue(); } + AffineMapAttr getAffineMapAttr() { + return getAttr(getMapAttrName()).cast(); + } + + /// Returns the AffineMapAttr associated with 'memref'. + NamedAttribute getAffineMapAttrForMemRef(Value memref) { + assert(memref == getMemRef()); + return {Identifier::get(getMapAttrName(), getContext()), + getAffineMapAttr()}; + } + + static StringRef getMapAttrName() { return "map"; } + static StringRef getOperationName() { return "affine.load"; } + + // Hooks to customize behavior of this op. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + static void getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context); + OpFoldResult fold(ArrayRef operands); +}; + +/// The "affine.store" op writes an element to a memref, where the index +/// for each memref dimension is an affine expression of loop induction +/// variables and symbols. The 'affine.store' op stores a new value which is the +/// same type as the elements of the memref. 
An affine expression of loop IVs +/// and symbols must be specified for each dimension of the memref. The keyword +/// 'symbol' can be used to indicate SSA identifiers which are symbolic. +// +// Example 1: +// +// affine.store %v0, %0[%i0 + 3, %i1 + 7] : memref<100x100xf32> +// +// Example 2: Uses 'symbol' keyword for symbols '%n' and '%m'. +// +// affine.store %v0, %0[%i0 + symbol(%n), %i1 + symbol(%m)] +// : memref<100x100xf32> +// +class AffineStoreOp : public Op::Impl> { +public: + using Op::Op; + + /// Builds an affine store operation with the provided indices (identity map). + static void build(Builder *builder, OperationState &result, + Value valueToStore, Value memref, ValueRange indices); + /// Builds an affine store operation with the specified map and its operands. + static void build(Builder *builder, OperationState &result, + Value valueToStore, Value memref, AffineMap map, + ValueRange mapOperands); + + /// Get value to be stored by store operation. + Value getValueToStore() { return getOperand(0); } + + /// Returns the operand index of the memref. + unsigned getMemRefOperandIndex() { return 1; } + + /// Get memref operand. + Value getMemRef() { return getOperand(getMemRefOperandIndex()); } + void setMemRef(Value value) { setOperand(getMemRefOperandIndex(), value); } + + MemRefType getMemRefType() { + return getMemRef()->getType().cast(); + } + + /// Get affine map operands. + operand_range getMapOperands() { return llvm::drop_begin(getOperands(), 2); } + + /// Returns the affine map used to index the memref for this operation. + AffineMap getAffineMap() { return getAffineMapAttr().getValue(); } + AffineMapAttr getAffineMapAttr() { + return getAttr(getMapAttrName()).cast(); + } + + /// Returns the AffineMapAttr associated with 'memref'. 
+ NamedAttribute getAffineMapAttrForMemRef(Value memref) { + assert(memref == getMemRef()); + return {Identifier::get(getMapAttrName(), getContext()), + getAffineMapAttr()}; + } + + static StringRef getMapAttrName() { return "map"; } + static StringRef getOperationName() { return "affine.store"; } + + // Hooks to customize behavior of this op. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + static void getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context); + LogicalResult fold(ArrayRef cstOperands, + SmallVectorImpl &results); +}; + +/// Returns true if the given Value can be used as a dimension id. +bool isValidDim(Value value); + +/// Returns true if the given Value can be used as a symbol. +bool isValidSymbol(Value value); + +/// Modifies both `map` and `operands` in-place so as to: +/// 1. drop duplicate operands +/// 2. drop unused dims and symbols from map +/// 3. promote valid symbols to symbolic operands in case they appeared as +/// dimensional operands +/// 4. propagate constant operands and drop them +void canonicalizeMapAndOperands(AffineMap *map, + SmallVectorImpl *operands); +/// Canonicalizes an integer set the same way canonicalizeMapAndOperands does +/// for affine maps. +void canonicalizeSetAndOperands(IntegerSet *set, + SmallVectorImpl *operands); + +/// Returns a composed AffineApplyOp by composing `map` and `operands` with +/// other AffineApplyOps supplying those operands. The operands of the resulting +/// AffineApplyOp do not change the length of AffineApplyOp chains. +AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, + ArrayRef operands); + +/// Given an affine map `map` and its input `operands`, this method composes +/// into `map`, maps of AffineApplyOps whose results are the values in +/// `operands`, iteratively until no more of `operands` are the result of an +/// AffineApplyOp. 
When this function returns, `map` becomes the composed affine +/// map, and each Value in `operands` is guaranteed to be either a loop IV or a +/// terminal symbol, i.e., a symbol defined at the top level or a block/function +/// argument. +void fullyComposeAffineMapAndOperands(AffineMap *map, + SmallVectorImpl *operands); + +#define GET_OP_CLASSES +#include "mlir/Dialect/AffineOps/AffineOps.h.inc" + +/// Returns if the provided value is the induction variable of a AffineForOp. +bool isForInductionVar(Value val); + +/// Returns the loop parent of an induction variable. If the provided value is +/// not an induction variable, then return nullptr. +AffineForOp getForInductionVarOwner(Value val); + +/// Extracts the induction variables from a list of AffineForOps and places them +/// in the output argument `ivs`. +void extractForInductionVars(ArrayRef forInsts, + SmallVectorImpl *ivs); + +/// AffineBound represents a lower or upper bound in the for operation. +/// This class does not own the underlying operands. Instead, it refers +/// to the operands stored in the AffineForOp. Its life span should not exceed +/// that of the for operation it refers to. +class AffineBound { +public: + AffineForOp getAffineForOp() { return op; } + AffineMap getMap() { return map; } + + /// Returns an AffineValueMap representing this bound. + AffineValueMap getAsAffineValueMap(); + + unsigned getNumOperands() { return opEnd - opStart; } + Value getOperand(unsigned idx) { return op.getOperand(opStart + idx); } + + using operand_iterator = AffineForOp::operand_iterator; + using operand_range = AffineForOp::operand_range; + + operand_iterator operand_begin() { return op.operand_begin() + opStart; } + operand_iterator operand_end() { return op.operand_begin() + opEnd; } + operand_range getOperands() { return {operand_begin(), operand_end()}; } + +private: + // 'affine.for' operation that contains this bound. 
+ AffineForOp op; + // Start and end positions of this affine bound operands in the list of + // the containing 'affine.for' operation operands. + unsigned opStart, opEnd; + // Affine map for this bound. + AffineMap map; + + AffineBound(AffineForOp op, unsigned opStart, unsigned opEnd, AffineMap map) + : op(op), opStart(opStart), opEnd(opEnd), map(map) {} + + friend class AffineForOp; +}; + +/// An `AffineApplyNormalizer` is a helper class that supports renumbering +/// operands of AffineApplyOp. This acts as a reindexing map of Value to +/// positional dims or symbols and allows simplifications such as: +/// +/// ```mlir +/// %1 = affine.apply (d0, d1) -> (d0 - d1) (%0, %0) +/// ``` +/// +/// into: +/// +/// ```mlir +/// %1 = affine.apply () -> (0) +/// ``` +struct AffineApplyNormalizer { + AffineApplyNormalizer(AffineMap map, ArrayRef<Value> operands); + + /// Returns the AffineMap resulting from normalization. + AffineMap getAffineMap() { return affineMap; } + + SmallVector<Value, 8> getOperands() { + SmallVector<Value, 8> res(reorderedDims); + res.append(concatenatedSymbols.begin(), concatenatedSymbols.end()); + return res; + } + + unsigned getNumSymbols() { return concatenatedSymbols.size(); } + unsigned getNumDims() { return reorderedDims.size(); } + + /// Normalizes 'otherMap' and its operands 'otherOperands' to map to this + /// normalizer's coordinate space. + void normalize(AffineMap *otherMap, SmallVectorImpl<Value> *otherOperands); + +private: + /// Helper function to insert `v` into the coordinate system of the current + /// AffineApplyNormalizer. Returns the AffineDimExpr with the corresponding + /// renumbered position. + AffineDimExpr renumberOneDim(Value v); + + /// Given an `other` normalizer, this rewrites `other.affineMap` in the + /// coordinate system of the current AffineApplyNormalizer. + /// Returns the rewritten AffineMap and updates the dims and symbols of + /// `this`. + AffineMap renumber(const AffineApplyNormalizer &other); + + /// Maps of Value to position in `affineMap`. + DenseMap<Value, unsigned> dimValueToPosition; + + /// Ordered dims and symbols matching positional dims and symbols in + /// `affineMap`. + SmallVector<Value, 8> reorderedDims; + SmallVector<Value, 8> concatenatedSymbols; + + AffineMap affineMap; + + /// Used with RAII to control the depth at which AffineApply are composed + /// recursively. Only accepts depth 1 for now to allow a behavior where a + /// newly composed AffineApplyOp does not increase the length of the chain of + /// AffineApplyOps. Full composition is implemented iteratively on top of + /// this behavior. + static unsigned &affineApplyDepth() { + static thread_local unsigned depth = 0; + return depth; + } + static constexpr unsigned kMaxAffineApplyDepth = 1; + + AffineApplyNormalizer() { affineApplyDepth()++; } + +public: + ~AffineApplyNormalizer() { affineApplyDepth()--; } +}; + +} // end namespace mlir + +#endif diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.td b/mlir/include/mlir/Dialect/AffineOps/AffineOps.td new file mode 100644 index 0000000000000000000000000000000000000000..114e20513b2ae0f9721e34e683ba759cbd5af1b0 --- /dev/null +++ b/mlir/include/mlir/Dialect/AffineOps/AffineOps.td @@ -0,0 +1,350 @@ +//===- AffineOps.td - Affine operation definitions ---------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines MLIR affine operations.
+// +//===----------------------------------------------------------------------===// + +#ifndef AFFINE_OPS +#define AFFINE_OPS + +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/IR/OpBase.td" +include "mlir/Transforms/LoopLikeInterface.td" + +def Affine_Dialect : Dialect { + let name = "affine"; + let cppNamespace = ""; +} + +// Base class for Affine dialect ops. +class Affine_Op traits = []> : + Op { + // For every affine op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions. + let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +// Require regions to have affine terminator. +def ImplicitAffineTerminator + : SingleBlockImplicitTerminator<"AffineTerminatorOp">; + +def AffineForOp : Affine_Op<"for", + [ImplicitAffineTerminator, + DeclareOpInterfaceMethods]> { + let summary = "for operation"; + let description = [{ + The "affine.for" operation represents an affine loop nest, defining an SSA + value for its induction variable. It has one region capturing the loop body. + The induction variable is represented as a argument of this region. This SSA + value always has type index, which is the size of the machine word. The + stride, represented by step, is a positive constant integer which defaults + to "1" if not present. The lower and upper bounds specify a half-open range: + the range includes the lower bound but does not include the upper bound. + + The body region must contain exactly one block that terminates with + "affine.terminator". Calling AffineForOp::build will create such region + and insert the terminator, so will the parsing even in cases if it is absent + from the custom format. 
+ + The lower and upper bounds of a for operation are represented as an + application of an affine mapping to a list of SSA values passed to the map. + The same restrictions hold for these SSA values as for all bindings of SSA + values to dimensions and symbols. The affine mappings for the bounds may + return multiple results, in which case the max/min keywords are required + (for the lower/upper bound respectively), and the bound is the + maximum/minimum of the returned values. + + Example: + + affine.for %i = 1 to 10 { + ... + } + + }]; + let arguments = (ins Variadic); + let regions = (region SizedRegion<1>:$region); + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, " + "int64_t lowerBound, int64_t upperBound, int64_t step = 1">, + OpBuilder<"Builder *builder, OperationState &result, " + "ValueRange lbOperands, AffineMap lbMap, " + "ValueRange ubOperands, AffineMap ubMap, " + "int64_t step = 1"> + ]; + + let extraClassDeclaration = [{ + static StringRef getStepAttrName() { return "step"; } + static StringRef getLowerBoundAttrName() { return "lower_bound"; } + static StringRef getUpperBoundAttrName() { return "upper_bound"; } + + Block *getBody() { return ®ion().front(); } + Value getInductionVar() { return getBody()->getArgument(0); } + OpBuilder getBodyBuilder() { + return OpBuilder(getBody(), std::prev(getBody()->end())); + } + + // TODO: provide iterators for the lower and upper bound operands + // if the current access via getLowerBound(), getUpperBound() is too slow. + + /// Returns operands for the lower bound map. + operand_range getLowerBoundOperands(); + + /// Returns operands for the upper bound map. + operand_range getUpperBoundOperands(); + + /// Returns information about the lower bound as a single object. + AffineBound getLowerBound(); + + /// Returns information about the upper bound as a single object. + AffineBound getUpperBound(); + + /// Returns loop step. 
+ int64_t getStep() { + return getAttr(getStepAttrName()).cast().getInt(); + } + + /// Returns affine map for the lower bound. + AffineMap getLowerBoundMap() { return getLowerBoundMapAttr().getValue(); } + AffineMapAttr getLowerBoundMapAttr() { + return getAttr(getLowerBoundAttrName()).cast(); + } + /// Returns affine map for the upper bound. The upper bound is exclusive. + AffineMap getUpperBoundMap() { return getUpperBoundMapAttr().getValue(); } + AffineMapAttr getUpperBoundMapAttr() { + return getAttr(getUpperBoundAttrName()).cast(); + } + + /// Set lower bound. The new bound must have the same number of operands as + /// the current bound map. Otherwise, 'replaceForLowerBound' should be used. + void setLowerBound(ValueRange operands, AffineMap map); + /// Set upper bound. The new bound must not have more operands than the + /// current bound map. Otherwise, 'replaceForUpperBound' should be used. + void setUpperBound(ValueRange operands, AffineMap map); + + /// Set the lower bound map without changing operands. + void setLowerBoundMap(AffineMap map); + + /// Set the upper bound map without changing operands. + void setUpperBoundMap(AffineMap map); + + /// Set loop step. + void setStep(int64_t step) { + assert(step > 0 && "step has to be a positive integer constant"); + auto *context = getLowerBoundMap().getContext(); + setAttr(Identifier::get(getStepAttrName(), context), + IntegerAttr::get(IndexType::get(context), step)); + } + + /// Returns true if the lower bound is constant. + bool hasConstantLowerBound(); + /// Returns true if the upper bound is constant. + bool hasConstantUpperBound(); + /// Returns true if both bounds are constant. + bool hasConstantBounds() { + return hasConstantLowerBound() && hasConstantUpperBound(); + } + /// Returns the value of the constant lower bound. + /// Fails assertion if the bound is non-constant. + int64_t getConstantLowerBound(); + /// Returns the value of the constant upper bound. The upper bound is + /// exclusive. 
Fails assertion if the bound is non-constant. + int64_t getConstantUpperBound(); + /// Sets the lower bound to the given constant value. + void setConstantLowerBound(int64_t value); + /// Sets the upper bound to the given constant value. + void setConstantUpperBound(int64_t value); + + /// Returns true if both the lower and upper bound have the same operand + /// lists (same operands in the same order). + bool matchingBoundOperandList(); + }]; + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def AffineIfOp : Affine_Op<"if", [ImplicitAffineTerminator]> { + let summary = "if-then-else operation"; + let description = [{ + The "if" operation represents an if-then-else construct for conditionally + executing two regions of code. The operands to an if operation are an + IntegerSet condition and a set of symbol/dimension operands to the + condition set. The operation produces no results. For example: + + affine.if #set(%i) { + ... + } else { + ... + } + + The 'else' blocks to the if operation are optional, and may be omitted. For + example: + + affine.if #set(%i) { + ... + } + }]; + let arguments = (ins Variadic); + let regions = (region SizedRegion<1>:$thenRegion, AnyRegion:$elseRegion); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, " + "IntegerSet set, ValueRange args, bool withElseRegion"> + ]; + + let extraClassDeclaration = [{ + static StringRef getConditionAttrName() { return "condition"; } + + IntegerSet getIntegerSet(); + void setIntegerSet(IntegerSet newSet); + + /// Sets the integer set with its operands. The size of 'operands' must not + /// exceed the current number of operands for this instance, as the operands + /// list of AffineIf is not resizable. 
+ void setConditional(IntegerSet set, ValueRange operands); + + OpBuilder getThenBodyBuilder() { + assert(!thenRegion().empty() && "Unexpected empty 'then' region."); + Block &body = thenRegion().front(); + return OpBuilder(&body, std::prev(body.end())); + } + OpBuilder getElseBodyBuilder() { + assert(!elseRegion().empty() && "Unexpected empty 'else' region."); + Block &body = elseRegion().front(); + return OpBuilder(&body, std::prev(body.end())); + } + }]; + + let hasFolder = 1; +} + +def AffineMinOp : Affine_Op<"min"> { + let summary = "min operation"; + let description = [{ + The "min" operation computes the minimum value result from a multi-result + affine map. + + Example: + + %0 = affine.min (d0) -> (1000, d0 + 512) (%i0) : index + }]; + let arguments = (ins AffineMapAttr:$map, Variadic:$operands); + let results = (outs Index); + let extraClassDeclaration = [{ + static StringRef getMapAttrName() { return "map"; } + }]; + let hasFolder = 1; +} + +def AffinePrefetchOp : Affine_Op<"prefetch"> { + let summary = "affine prefetch operation"; + let description = [{ + The "affine.prefetch" op prefetches data from a memref location described + with an affine subscript similar to affine.load, and has three attributes: + a read/write specifier, a locality hint, and a cache type specifier as shown + below: + + affine.prefetch %0[%i, %j + 5], read, locality<3>, data + : memref<400x400xi32> + + The read/write specifier is either 'read' or 'write', the locality hint + specifier ranges from locality<0> (no locality) to locality<3> (extremely + local keep in cache). The cache type specifier is either 'data' or 'instr' + and specifies whether the prefetch is performed on data cache or on + instruction cache. 
+ }]; + + let arguments = (ins AnyMemRef:$memref, Variadic:$indices, + BoolAttr:$isWrite, + Confined, + IntMaxValue<3>]>:$localityHint, + BoolAttr:$isDataCache); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value memref," + "AffineMap map, ArrayRef mapOperands, bool isWrite," + "unsigned localityHint, bool isDataCache", + [{ + assert(map.getNumInputs() == mapOperands.size() + && "inconsistent index info"); + auto localityHintAttr = builder->getI32IntegerAttr(localityHint); + auto isWriteAttr = builder->getBoolAttr(isWrite); + auto isDataCacheAttr = builder->getBoolAttr(isDataCache); + result.addOperands(memref); + result.addAttribute(getMapAttrName(), AffineMapAttr::get(map)); + result.addOperands(mapOperands); + result.addAttribute(getLocalityHintAttrName(), localityHintAttr); + result.addAttribute(getIsWriteAttrName(), isWriteAttr); + result.addAttribute(getIsDataCacheAttrName(), isDataCacheAttr); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref()->getType().cast(); + } + + /// Returns the affine map used to index the memref for this operation. + AffineMap getAffineMap() { return getAffineMapAttr().getValue(); } + AffineMapAttr getAffineMapAttr() { + return getAttr(getMapAttrName()).cast(); + } + + /// Returns the AffineMapAttr associated with 'memref'. + NamedAttribute getAffineMapAttrForMemRef(Value mref) { + assert(mref == memref()); + return {Identifier::get(getMapAttrName(), getContext()), + getAffineMapAttr()}; + } + + /// Get affine map operands. 
+ operand_range getMapOperands() { + return {operand_begin() + 1, operand_end()}; + } + + static StringRef getMapAttrName() { return "map"; } + static StringRef getLocalityHintAttrName() { return "localityHint"; } + static StringRef getIsWriteAttrName() { return "isWrite"; } + static StringRef getIsDataCacheAttrName() { return "isDataCache"; } + }]; + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def AffineTerminatorOp : + Affine_Op<"terminator", [Terminator]> { + let summary = "affine terminator operation"; + let description = [{ + Affine terminator is a special terminator operation for blocks inside affine + loops and branches. It unconditionally transmits the control flow to the + successor of the operation enclosing the region. + + This operation does _not_ have a custom syntax. However, affine control + operations omit the terminator in their custom syntax for brevity. + }]; + + // No custom parsing/printing form. + let parser = ?; + let printer = ?; + + // Fully specified by traits. + let verifier = ?; +} + +#endif // AFFINE_OPS diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td b/mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td new file mode 100644 index 0000000000000000000000000000000000000000..6aee5f3cd4a51fcc67806a2d263a324dbe3aa50a --- /dev/null +++ b/mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td @@ -0,0 +1,31 @@ +//===- AffineOpsBase.td - Affine operation definitions -----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines base support for MLIR affine operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AFFINE_OPS_BASE +#define AFFINE_OPS_BASE + +include "mlir/IR/OpBase.td" + +// Attributes containing affine maps. +def AffineMapAttr : Attr< + CPred<"$_self.isa()">, "AffineMap attribute"> { + let storageType = [{ AffineMapAttr }]; + let returnType = [{ AffineMap }]; + let constBuilderCall = "AffineMapAttr::get($0)"; +} + +def AffineMapArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getAffineMapArrayAttr($0)"; +} + +#endif // AFFINE_OPS_BASE diff --git a/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt b/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7339bcc9dcfd6406ec3e358487d09caa9edde88c --- /dev/null +++ b/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(AffineOps AffineOps) diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9235436995aea83a6b1736b897f97abb74a7f6bc --- /dev/null +++ b/mlir/include/mlir/Dialect/CMakeLists.txt @@ -0,0 +1,10 @@ +add_subdirectory(AffineOps) +add_subdirectory(FxpMathOps) +add_subdirectory(GPU) +add_subdirectory(Linalg) +add_subdirectory(LLVMIR) +add_subdirectory(LoopOps) +add_subdirectory(QuantOps) +add_subdirectory(SPIRV) +add_subdirectory(StandardOps) +add_subdirectory(VectorOps) diff --git a/mlir/include/mlir/Dialect/CommonFolders.h b/mlir/include/mlir/Dialect/CommonFolders.h new file mode 100644 index 0000000000000000000000000000000000000000..d667de73d4194eb77b5ccc34324b71d4c3e5187f --- /dev/null +++ b/mlir/include/mlir/Dialect/CommonFolders.h @@ -0,0 +1,73 @@ +//===- CommonFolders.h - Common Operation Folders----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file declares various common operation folders. These folders +// are intended to be used by dialects to support common folding behavior +// without requiring each dialect to provide its own implementation. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_COMMONFOLDERS_H +#define MLIR_DIALECT_COMMONFOLDERS_H + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/StandardTypes.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" + +namespace mlir { +/// Performs constant folding `calculate` with element-wise behavior on the two +/// attributes in `operands` and returns the result if possible. +template <class AttrElementT, + class ElementValueT = typename AttrElementT::ValueType, + class CalculationT = + function_ref<ElementValueT(ElementValueT, ElementValueT)>> +Attribute constFoldBinaryOp(ArrayRef<Attribute> operands, + const CalculationT &calculate) { + assert(operands.size() == 2 && "binary op takes two operands"); + if (!operands[0] || !operands[1]) + return {}; + if (operands[0].getType() != operands[1].getType()) + return {}; + + if (operands[0].isa<AttrElementT>() && operands[1].isa<AttrElementT>()) { + auto lhs = operands[0].cast<AttrElementT>(); + auto rhs = operands[1].cast<AttrElementT>(); + + return AttrElementT::get(lhs.getType(), + calculate(lhs.getValue(), rhs.getValue())); + } else if (operands[0].isa<SplatElementsAttr>() && + operands[1].isa<SplatElementsAttr>()) { + // Both operands are splats so we can avoid expanding the values out and + // just fold based on the splat value. + auto lhs = operands[0].cast<SplatElementsAttr>(); + auto rhs = operands[1].cast<SplatElementsAttr>(); + + auto elementResult = calculate(lhs.getSplatValue<ElementValueT>(), + rhs.getSplatValue<ElementValueT>()); + return DenseElementsAttr::get(lhs.getType(), elementResult); + } else if (operands[0].isa<ElementsAttr>() && + operands[1].isa<ElementsAttr>()) { + // Operands are ElementsAttr-derived; perform an element-wise fold by + // expanding the values.
+ auto lhs = operands[0].cast<ElementsAttr>(); + auto rhs = operands[1].cast<ElementsAttr>(); + + auto lhsIt = lhs.getValues<ElementValueT>().begin(); + auto rhsIt = rhs.getValues<ElementValueT>().begin(); + SmallVector<ElementValueT, 4> elementResults; + elementResults.reserve(lhs.getNumElements()); + for (size_t i = 0, e = lhs.getNumElements(); i < e; ++i, ++lhsIt, ++rhsIt) + elementResults.push_back(calculate(*lhsIt, *rhsIt)); + return DenseElementsAttr::get(lhs.getType(), elementResults); + } + return {}; +} +} // namespace mlir + +#endif // MLIR_DIALECT_COMMONFOLDERS_H diff --git a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..484230778b3d78c85050ad3e77184e1ca23df69f --- /dev/null +++ b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(FxpMathOps FxpMathOps) diff --git a/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.h b/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.h new file mode 100644 index 0000000000000000000000000000000000000000..8c0e7aa1aadce647cf5df07ddb72cfd19217b9f6 --- /dev/null +++ b/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.h @@ -0,0 +1,31 @@ +//===- FxpMathOps.h - Fixed point ops ---------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_FXPMATHOPS_FXPMATHOPS_H_ +#define MLIR_DIALECT_FXPMATHOPS_FXPMATHOPS_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" + +namespace mlir { +namespace fxpmath { + +/// Defines the 'FxpMathOps' dialect.
+class FxpMathOpsDialect : public Dialect { +public: + FxpMathOpsDialect(MLIRContext *context); +}; + +#define GET_OP_CLASSES +#include "mlir/Dialect/FxpMathOps/FxpMathOps.h.inc" + +} // namespace fxpmath +} // namespace mlir + +#endif // MLIR_DIALECT_FXPMATHOPS_FXPMATHOPS_H_ diff --git a/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.td b/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.td new file mode 100644 index 0000000000000000000000000000000000000000..d527b759a10c19c64614a6995f0ade815988ea41 --- /dev/null +++ b/mlir/include/mlir/Dialect/FxpMathOps/FxpMathOps.td @@ -0,0 +1,277 @@ +//===- FxpMathOps.td - Fixed point ops --------------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the operation definition file for fixed point ops (and real +// equivalents). +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_FXPMATHOPS_FXPMATH_OPS_ +#define DIALECT_FXPMATHOPS_FXPMATH_OPS_ + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/QuantOps/QuantPredicates.td" + +def fxpmath_Dialect : Dialect { + let name = "fxpmath"; +} + +//===----------------------------------------------------------------------===// +// Attributes +//===----------------------------------------------------------------------===// + +// Real value for an (inclusive) min/max clamp limit. +def fxpmath_ClampValueAttr : OptionalAttr; + +// Element-wise activation function to apply. +// Note that RELU activations are not here: they are expressed as clamps. 
+def fxpmath_EwUnaryFnAttr : + StringBasedAttr, "element-wise unary function"> { + let returnType = [{ StringRef }]; + let defaultValue = "IDENTITY"; +} + +class fxpmath_ConstEwUnaryFn : ConstantAttr; +def fxpmath_EwUnaryFn_Abs : fxpmath_ConstEwUnaryFn<"ABS">; +def fxpmath_EwUnaryFn_Exp : fxpmath_ConstEwUnaryFn<"EXP">; +def fxpmath_EwUnaryFn_Identity: fxpmath_ConstEwUnaryFn<"IDENTITY">; +def fxpmath_EwUnaryFn_Log : fxpmath_ConstEwUnaryFn<"LOG">; +def fxpmath_EwUnaryFn_Neg : fxpmath_ConstEwUnaryFn<"NEG">; +def fxpmath_EwUnaryFn_Rsqrt : fxpmath_ConstEwUnaryFn<"RSQRT">; +def fxpmath_EwUnaryFn_Sigmoid : fxpmath_ConstEwUnaryFn<"SIGMOID">; +def fxpmath_EwUnaryFn_Sign : fxpmath_ConstEwUnaryFn<"SIGN">; +def fxpmath_EwUnaryFn_Sin : fxpmath_ConstEwUnaryFn<"SIN">; +def fxpmath_EwUnaryFn_Sqrt : fxpmath_ConstEwUnaryFn<"SQRT">; +def fxpmath_EwUnaryFn_Square : fxpmath_ConstEwUnaryFn<"SQUARE">; +def fxpmath_EwUnaryFn_Tanh : fxpmath_ConstEwUnaryFn<"TANH">; + +//===----------------------------------------------------------------------===// +// Comparison functions (compares relative to zero on a subtraction result). 
+//===----------------------------------------------------------------------===// + +def fxpmath_CompareZ : StrEnumAttrCase<"CMPZ">; +def fxpmath_CompareNZ : StrEnumAttrCase<"CMPNZ">; +def fxpmath_CompareLZ : StrEnumAttrCase<"CMPLZ">; +def fxpmath_CompareLZE : StrEnumAttrCase<"CMPLZE">; +def fxpmath_CompareGZ : StrEnumAttrCase<"CMPGZ">; +def fxpmath_CompareGZE : StrEnumAttrCase<"CMPGZE">; + +def fxpmath_CompareFnAttr : StrEnumAttr<"ComparisonFn", + "Type of subtraction-result comparison to perform.", + [ + fxpmath_CompareZ, + fxpmath_CompareNZ, + fxpmath_CompareLZ, + fxpmath_CompareLZE, + fxpmath_CompareGZ, + fxpmath_CompareGZE + ]>; + +//===----------------------------------------------------------------------===// +// Base classes +//===----------------------------------------------------------------------===// + +class fxpmath_Op traits> : + Op; + +//===----------------------------------------------------------------------===// +// Fixed-point (fxp) arithmetic ops used by kernels. +// Some of these are temporary pending inclusion into a more core dialect. +//===----------------------------------------------------------------------===// + +def fxpmath_ClampISOp : fxpmath_Op<"clampis", [NoSideEffect, SameOperandsAndResultType]> { + let summary = + "Clamps a signed-integer like argument to a min/max range."; + let description = [{ + Element-wise equivalent to: + r = std::min(clamp_max, std::max(e, clamp_min)) + }]; + let arguments = (ins IntegerLike:$operand, + APIntAttr:$clamp_min, + APIntAttr:$clamp_max); + let results = (outs IntegerLike); +} + +def fxpmath_ConvertISOp : + fxpmath_Op<"convertis", + [NoSideEffect, SameOperandsAndResultShape]> { + let summary = + "Does an element-wise conversion from a signed integer to signed integer"; + let description = [{ + Similar to an element-wise static_cast in C++, from a one signed integer + element type to another. 
+ }]; + let arguments = (ins IntegerLike:$operand); + let results = (outs IntegerLike); +} + +def fxpmath_ConvertISToFOp : + fxpmath_Op<"convertistof", + [NoSideEffect, SameOperandsAndResultShape]> { + let summary = + "Does an element-wise conversion from a signed integer to a float"; + let description = [{ + Similar to an element-wise static_cast in C++, from a signed integer + element type to a floating point element type, rounding to the nearest + floating point value. + }]; + let arguments = (ins IntegerLike:$operand); + let results = (outs FloatLike); +} + + +def fxpmath_VecScalarSaturatingRoundingDoublingHighMulISOp : + fxpmath_Op<"vs_saturating_rounding_doubling_high_mulis", + [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Implements equivalent functionality to ARMv7 NEON VQRDMULH"; + let description = [{ + Equivalent to the ARMv7 NEON VQRDMULH instruction. + See gemmlowp::SaturatingRoundingDoublingHighMul for a reference + implementation. + }]; + let arguments = (ins IntegerLike:$a, APIntAttr:$b); + let results = (outs IntegerLike); +} + +def fxpmath_RoundingDivideByPotISOp : + fxpmath_Op<"rounding_divide_by_potis", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ + Computes a rounding arithmetic right shift. + }]; + let description = [{ + Computes integer division by a power-of-two, correctly rounded-to-nearest. + Also known as a rounding arithmetic right shift. See + gemmlowp::RoundingDivideByPOT for a reference implementation. + }]; + let arguments = (ins IntegerLike:$operand, APIntAttr:$exponent); + let results = (outs IntegerLike:$res); + let verifier = [{ + auto verifyExponent = exponent().getSExtValue(); + if (verifyExponent < 0 || verifyExponent > 31) { + return emitOpError("exponent must be in range [0..31]"); + } + return success(); + }]; +} + +//===----------------------------------------------------------------------===// +// Real math ops. 
+// +// Math ops on real numbers which may have a representation in quantized +// arithmetic. It is expected that eligible ops are lowered from a source +// dialect to this set of ops prior to the process of converting a computation +// to a quantized form. It is a non-goal of these ops to preserve enough +// information to convert back to the higher level, source dialect. +// +// These ops support either real/floating point or QuantizedTypes as operands +// and results. Since not all transformations are supported (globally or +// sometimes for specific targets), a computation may end up with +// untransformable RealMathOps, in which case they need to be lowered as is +// (using floating point math). +// +// This op set takes advantage of the fact that it is typically trivial to +// combine a math function with a compatible bias addition and real-valued +// clamp (which can be done at a higher accumulation bit depth). +// +// In addition, all element-wise unary functions are collapsed into a single +// fxpmath_RealUnaryEwOp and selected via an enum-like attribute. Especially at +// low bit depths, this makes matching simpler and allows the construction of +// generic LUT-based implementations. It also allows specific lowering rules +// to consolidate runs of chained unary ops and fuse them to preceding math +// ops, potentially allowing them to operate directly on higher precision +// intermediates without resorting to lots of custom kernels for common +// formulas that can suffer from insufficient precision at low bit depths. +// +// Comparison operators are modeled as element-wise unary functions (i.e. +// CMPZ, CMPNZ, CMPLZ, CMPGZ) intended to follow a sub and output a 1bit +// quantized value. It is expected that lowering rules can fuse them with +// the preceding sub. 
+//===----------------------------------------------------------------------===// + +class fxpmath_RealMathOp traits = [], dag args> : + fxpmath_Op, + Arguments; + +//===----------------------------------------------------------------------===// +// Element wise binary real math ops. +//===----------------------------------------------------------------------===// + +class fxpmath_RealBinaryOp traits = []> : + fxpmath_RealMathOp, + Results<(outs quant_RealValueType:$res)>; + +class fxpmath_RealBinaryBiasOp traits = []> : + fxpmath_RealMathOp, + Results<(outs quant_RealValueType:$res)>; + +def fxpmath_RealAddEwOp : + fxpmath_RealBinaryOp<"real_add_ew", [NoSideEffect]>; + +def fxpmath_RealSubEwOp : + fxpmath_RealBinaryOp<"real_sub_ew", [NoSideEffect]>; + +def fxpmath_RealMulEwOp : + fxpmath_RealBinaryOp<"real_mul_ew", [NoSideEffect]>; + +def fxpmath_RealDivEwOp : + fxpmath_RealBinaryOp<"real_div_ew", [NoSideEffect]>; + +//===----------------------------------------------------------------------===// +// Element wise unary real math op. +//===----------------------------------------------------------------------===// + +def fxpmath_RealUnaryEwOp : + fxpmath_RealMathOp<"real_unary_ew", [NoSideEffect], + (ins quant_RealValueType:$operand, fxpmath_EwUnaryFnAttr:$fn)>, + Results<(outs quant_RealValueType:$res)>; + +def fxpmath_RealCompareZeroEwOp : fxpmath_Op<"compare", [NoSideEffect]>, + Arguments<(ins quant_RealValueType:$operand, fxpmath_CompareFnAttr:$fn)>, + Results<(outs I1Tensor:$res)> { + let description = [{ + Compares a real value to zero, returning an I1 (boolean) tensor with the + result of applying the comparison function. + }]; +} + +//===----------------------------------------------------------------------===// +// Dot op with fused bias addition. 
+//===----------------------------------------------------------------------===// + +def fxpmath_RealMatMulOp : + fxpmath_RealBinaryOp<"real_matmul", [NoSideEffect]> { + let summary = "Matmul"; + let description = [{ + A matrix multiply of [m, k] and [k, n] -> [m, n] where the bias vector is + of shape [n]. Also accepts rank 3 or more input tensors, in which case + the leading dimensions are batch dims. + + Many real systems have specific library calls optimized for this precise + operation, which is why it is handled explicitly versus purely as a + generalized tensor contraction. + }]; +} + +def fxpmath_RealMatMulBiasOp : + fxpmath_RealBinaryBiasOp<"real_matmul_bias", [NoSideEffect]> { + let summary = "Matmul with bias"; + let description = [{ + A specialization of a RealMatMulOp that also accepts an [n] dimension + bias vector. + + In addition, there is often special support for a fused bias and clamp, + which is why they are included. + }]; +} + +#endif // DIALECT_FXPMATHOPS_FXPMATH_OPS_ diff --git a/mlir/include/mlir/Dialect/FxpMathOps/Passes.h b/mlir/include/mlir/Dialect/FxpMathOps/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..aec21c4c18621b3dfc3f0dbdeab001ac6cf6817f --- /dev/null +++ b/mlir/include/mlir/Dialect/FxpMathOps/Passes.h @@ -0,0 +1,35 @@ +//===- Passes.h - Fixed point math passes -----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the passes owned by the FxpMathOps dialect. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_FXPMATHOPS_PASSES_H +#define MLIR_DIALECT_FXPMATHOPS_PASSES_H + +namespace mlir { +class FuncOp; +template class OpPassBase; + +namespace fxpmath { + +/// Creates a pass that lowers uniform-quantized real math ops to integer +/// arithmetic. This will leave unrecognized real math ops as-is and is +/// typically followed by a pass that lowers any unrecognized ops to a pure +/// floating point form. +OpPassBase *createLowerUniformRealMathPass(); + +/// Creates a pass that lowers uniform-quantized qcast/dcast ops to equivalent +/// operations that perform quantize/dequantize. +OpPassBase *createLowerUniformCastsPass(); + +} // namespace fxpmath +} // namespace mlir + +#endif // MLIR_DIALECT_FXPMATHOPS_PASSES_H diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd85b5bcfbfa21463eff0e66ab138f16899d7935 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(GPUOps GPUOps) diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..1776ff7198052fa19087216f2692c6675aac1a41 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -0,0 +1,82 @@ +//===- GPUDialect.h - MLIR Dialect for GPU Kernels --------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the GPU kernel-related operations and puts them in the +// corresponding dialect. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_GPUDIALECT_H +#define MLIR_DIALECT_GPU_GPUDIALECT_H + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/FunctionSupport.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/SymbolTable.h" + +namespace mlir { +class FuncOp; + +namespace gpu { + +/// The dialect containing GPU kernel launching operations and related +/// facilities. +class GPUDialect : public Dialect { +public: + /// Create the dialect in the given `context`. + explicit GPUDialect(MLIRContext *context); + /// Get dialect namespace. + static StringRef getDialectNamespace() { return "gpu"; } + + /// Get the name of the attribute used to annotate the modules that contain + /// kernel modules. + static StringRef getContainerModuleAttrName() { + return "gpu.container_module"; + } + + /// Get the canonical string name of the dialect. + static StringRef getDialectName(); + + /// Get the name of the attribute used to annotate external kernel functions. + static StringRef getKernelFuncAttrName() { return "gpu.kernel"; } + + /// Get the name of the attribute used to annotate kernel modules. + static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; } + + /// Returns whether the given function is a kernel function, i.e., has the + /// 'gpu.kernel' attribute. + static bool isKernel(Operation *op); + + /// Returns the numeric value used to identify the workgroup memory address + /// space. + static unsigned getWorkgroupAddressSpace() { return 3; } + + /// Returns the numeric value used to identify the private memory address + /// space. + static unsigned getPrivateAddressSpace() { return 5; } + + LogicalResult verifyOperationAttribute(Operation *op, + NamedAttribute attr) override; +}; + +/// Utility class for the GPU dialect to represent triples of `Value`s +/// accessible through `.x`, `.y`, and `.z` similarly to CUDA notation. 
+struct KernelDim3 { + Value x; + Value y; + Value z; +}; + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/GPUOps.h.inc" + +} // end namespace gpu +} // end namespace mlir + +#endif // MLIR_DIALECT_GPU_GPUDIALECT_H diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td new file mode 100644 index 0000000000000000000000000000000000000000..b5b93e9b553b58740178f895ad83fe3b4e6f8ce4 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -0,0 +1,587 @@ +//===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines some operations of the GPU dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_OPS +#define GPU_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" + +// Type constraint accepting standard integers, indices and wrapped LLVM integer +// types. +def IntLikeOrLLVMInt : TypeConstraint< + Or<[AnyInteger.predicate, Index.predicate, LLVMInt.predicate]>, + "integer, index or LLVM dialect equivalent">; + +//===----------------------------------------------------------------------===// +// GPU Dialect operations. 
+//===----------------------------------------------------------------------===// + +def GPU_Dialect : Dialect { + let name = "gpu"; +} + +class GPU_Op traits = []> : + Op; + +class GPU_IndexOp traits = []> : + GPU_Op, + Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> { + let verifier = [{ return ::verifyIndexOp(*this); }]; +} + +def GPU_BlockDimOp : GPU_IndexOp<"block_dim">; +def GPU_BlockIdOp : GPU_IndexOp<"block_id">; +def GPU_GridDimOp : GPU_IndexOp<"grid_dim">; +def GPU_ThreadIdOp : GPU_IndexOp<"thread_id">; + +def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { + let summary = "Function executable on a GPU"; + + let description = [{ + Defines a function that can be executed on a GPU. This supports memory + attribution and its body has a particular execution model. + + GPU functions are either kernels (as indicated by the `kernel` attribute) or + regular functions. The former can be launched from the host side, while the + latter are device side only. + + The memory attribution defines SSA values that correspond to memory buffers + allocated in the memory hierarchy of the GPU (see below). + + The operation has one attached region that corresponds to the body of the + function. The region arguments consist of the function arguments without + modification, followed by buffers defined in memory annotations. The body of + a GPU function, when launched, is executed by multiple work items. There are + no guarantees on the order in which work items execute, or on the connection + between them. In particular, work items are not necessarily executed in + lock-step. Synchronization ops such as "gpu.barrier" should be used to + coordinate work items. Declarations of GPU functions, i.e. not having the + body region, are not supported. + + Syntax: + + ``` + op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->` + function-result-list)? + memory-attribution `kernel`? function-attributes? 
region + + memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? + (`private` `(` ssa-id-and-type-list `)`)? + ``` + + Example: + + ```mlir + gpu.func @foo(%arg0: index) + workgroup(%workgroup: memref<32xf32, 3>) + private(%private: memref<1xf32, 5>) + kernel + attributes {qux: "quux"} { + gpu.return + } + ``` + + The generic form illustrates the concept + + ```mlir + "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({ + ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>, + %private: memref<1xf32, 5>): + "gpu.return"() : () -> () + }) : (index) -> () + ``` + + Note the non-default memory spaces used in memref types in memory + attribution. + }]; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, StringRef name, " + "FunctionType type, ArrayRef workgroupAttributions = {}, " + "ArrayRef privateAttributions = {}, " + "ArrayRef attrs = {}"> + ]; + + let extraClassDeclaration = [{ + /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. + /// it is intended to be launched from host. + bool isKernel() { + return getAttrOfType(GPUDialect::getKernelFuncAttrName()) != + nullptr; + } + + /// Returns the type of the function this Op defines. + FunctionType getType() { + return getTypeAttr().getValue().cast(); + } + + /// Change the type of this function in place. This is an extremely + /// dangerous operation and it is up to the caller to ensure that this is + /// legal for this function, and to restore invariants: + /// - the entry block args must be updated to match the function params. + /// - the argument/result attributes may need an update: if the new type + /// has less parameters we drop the extra attributes, if there are more + /// parameters they won't have any attributes. + // TODO(b/146349912): consider removing this function thanks to rewrite + // patterns. 
+ void setType(FunctionType newType); + + /// Returns the number of buffers located in the workgroup memory. + unsigned getNumWorkgroupAttributions() { + return getAttrOfType(getNumWorkgroupAttributionsAttrName()) + .getInt(); + } + + /// Returns a list of block arguments that correspond to buffers located in + /// the workgroup memory + ArrayRef getWorkgroupAttributions() { + auto begin = + std::next(getBody().front().args_begin(), getType().getNumInputs()); + auto end = std::next(begin, getNumWorkgroupAttributions()); + return {begin, end}; + } + + /// Returns a list of block arguments that correspond to buffers located in + /// the private memory. + ArrayRef getPrivateAttributions() { + auto begin = + std::next(getBody().front().args_begin(), + getType().getNumInputs() + getNumWorkgroupAttributions()); + return {begin, getBody().front().args_end()}; + } + + /// Returns the name of the attribute containing the number of buffers + /// located in the workgroup memory. + static StringRef getNumWorkgroupAttributionsAttrName() { + return "workgroup_attributions"; + } + + // FunctionLike trait needs access to the functions below. + friend class OpTrait::FunctionLike; + + /// Hooks for the input/output type enumeration in FunctionLike . + unsigned getNumFuncArguments() { return getType().getNumInputs(); } + unsigned getNumFuncResults() { return getType().getNumResults(); } + + /// Returns the keywords used in the custom syntax for this Op. + static StringRef getWorkgroupKeyword() { return "workgroup"; } + static StringRef getPrivateKeyword() { return "private"; } + static StringRef getKernelKeyword() { return "kernel"; } + + /// Hook for FunctionLike verifier. + LogicalResult verifyType(); + + /// Verifies the body of the function. 
+ LogicalResult verifyBody(); + }]; + + // let verifier = [{ return ::verifFuncOpy(*this); }]; + let printer = [{ printGPUFuncOp(p, *this); }]; + let parser = [{ return parseGPUFuncOp(parser, result); }]; +} + +def GPU_LaunchFuncOp : GPU_Op<"launch_func">, + Arguments<(ins IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY, + IntLikeOrLLVMInt:$gridSizeZ, IntLikeOrLLVMInt:$blockSizeX, + IntLikeOrLLVMInt:$blockSizeY, IntLikeOrLLVMInt:$blockSizeZ, + Variadic:$operands)>, + Results<(outs)> { + let summary = "Launches a function as a GPU kernel"; + + let description = [{ + Launch a kernel function on the specified grid of thread blocks. + `gpu.launch` operations are lowered to `gpu.launch_func` operations by + outlining the kernel body into a function in a dedicated module, which + reflects the separate compilation process. The kernel function is required + to have the `gpu.kernel` attribute. The module containing the kernel + function is required to have the `gpu.kernel_module` attribute and must be + named. And finally, the module containing the kernel module (which thus + cannot be the top-level module) is required to have the + `gpu.container_module` attribute. The `gpu.launch_func` operation has a + string attribute named `kernel` to specify the name of the kernel function + to launch and an attribute named `kernel_module` to specify the name of the + module containing that kernel function. + + The operation takes at least six operands, with the first three operands + being grid sizes along x,y,z dimensions and the following three being block + sizes along x,y,z dimensions. When a lower-dimensional kernel is required, + unused sizes must be explicitly set to `1`. The remaining operands are + passed as arguments to the kernel function. + + A custom syntax for this operation is currently not available. + + Example: + + ```mlir + module attributes {gpu.container_module} { + + // This module creates a separate compilation unit for the GPU compiler. 
+ module @kernels attributes {gpu.kernel_module} { + func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">) + attributes { nvvm.kernel = true } { + + // Operations that produce block/thread IDs and dimensions are + // injected when outlining the `gpu.launch` body to a function called + // by `gpu.launch_func`. + %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) + %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) + %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) + + %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) + %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) + %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) + + %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) + %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) + %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) + + %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) + %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) + %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + + "some_op"(%bx, %tx) : (index, index) -> () + %42 = load %arg1[%bx] : memref + } + } + + "gpu.launch_func"(%cst, %cst, %cst, // Grid sizes. + %cst, %cst, %cst, // Block sizes. + %arg0, %arg1) // Arguments passed to the kernel. + { kernel_module = @kernels, // Module containing the kernel. + kernel = "kernel_1" } // Kernel function. 
+ : (index, index, index, index, index, index, f32, !llvm<"float*">) + -> () + } + ``` + }]; + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + "Value gridSizeX, Value gridSizeY, Value gridSizeZ, " + "Value blockSizeX, Value blockSizeY, Value blockSizeZ, " + "ValueRange kernelOperands">, + OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + "KernelDim3 gridSize, KernelDim3 blockSize, " + "ValueRange kernelOperands"> + ]; + + let extraClassDeclaration = [{ + /// The kernel function specified by the operation's `kernel` attribute. + StringRef kernel(); + + /// The number of operands passed to the kernel function. + unsigned getNumKernelOperands(); + + /// The name of the kernel module specified by the operation's + /// `kernel_module` attribute. + StringRef getKernelModuleName(); + + /// The i-th operand passed to the kernel function. + Value getKernelOperand(unsigned i); + + /// Get the SSA values passed as operands to specify the grid size. + KernelDim3 getGridSizeOperandValues(); + + /// Get the SSA values passed as operands to specify the block size. + KernelDim3 getBlockSizeOperandValues(); + + /// The number of launch configuration operands, placed at the leading + /// positions of the operand list. + static constexpr unsigned kNumConfigOperands = 6; + + // This needs to quietly verify if attributes with names defined below are + // present since it is run before the verifier of this op. + friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *, + NamedAttribute); + + /// The name of the symbolRef attribute specifying the kernel to launch. + static StringRef getKernelAttrName() { return "kernel"; } + + /// The name of the symbolRef attribute specifying the name of the module + /// containing the kernel to launch. 
+ static StringRef getKernelModuleAttrName() { return "kernel_module"; } + }]; + + let verifier = [{ return ::verify(*this); }]; +} + +def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>, + Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, + Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, + Variadic:$operands)>, + Results<(outs)> { + let summary = "GPU kernel launch operation"; + + let description = [{ + Launch a kernel on the specified grid of thread blocks. The body of the + kernel is defined by the single region that this operation contains. The + operation takes at least six operands, with first three operands being grid + sizes along x,y,z dimensions, the following three arguments being block + sizes along x,y,z dimension, and the remaining operands are arguments of the + kernel. When a lower-dimensional kernel is required, unused sizes must be + explicitly set to `1`. + + The body region has at least _twelve_ arguments, grouped as follows: + + - three arguments that contain block identifiers along x,y,z dimensions; + - three arguments that contain thread identifiers along x,y,z dimensions; + - operands of the `gpu.launch` operation as is, including six leading + operands for grid and block sizes. + + Operations inside the body region, and any operations in the nested regions, + are _not_ allowed to use values defined outside the _body_ region, as if + this region was a function. If necessary, values must be passed as kernel + arguments into the body region. Nested regions inside the kernel body are + allowed to use values defined in their ancestor regions as long as they + don't cross the kernel body region boundary. + + Syntax: + + ``` + operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment + `threads` `(` ssa-id-list `)` `in` ssa-reassignment + (`args` ssa-reassignment `:` type-list)? + region attr-dict? 
+ ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` + ``` + + Example: + + ```mlir + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) + threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) + args(%arg0 = %6, %arg1 = %7) : f32, memref { + // Block and thread identifiers, as well as block/grid sizes are + // immediately usable inside body region. + "some_op"(%bx, %tx) : (index, index) -> () + %42 = load %arg1[%bx] : memref + } + + // Generic syntax explains how the pretty syntax maps to the IR structure. + "gpu.launch"(%cst, %cst, %c1, // Grid sizes. + %cst, %c1, %c1, // Block sizes. + %arg0, %arg1) // Actual arguments. + {/*attributes*/} + // All sizes and identifiers have "index" size. + : (index, index, index, index, index, index, f32, memref) + -> () { + // The operation passes block and thread identifiers, followed by grid and + // block sizes, followed by actual arguments to the entry block of the + // region. + ^bb0(%bx : index, %by : index, %bz : index, + %tx : index, %ty : index, %tz : index, + %num_bx : index, %num_by : index, %num_bz : index, + %num_tx : index, %num_ty : index, %num_tz : index, + %arg0 : f32, %arg1 : memref): + "some_op"(%bx, %tx) : (index, index) -> () + %3 = "std.load"(%arg1, %bx) : (memref, index) -> f32 + } + ``` + + Rationale: using operation/block arguments gives analyses a clear way of + understanding that a value has additional semantics (e.g., we will need to + know what value corresponds to threadIdx.x for coalescing). We can recover + these properties by analyzing the operations producing values, but it is + easier just to have that information by construction. 
+ }]; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX," + "Value gridSizeY, Value gridSizeZ, Value blockSizeX," + "Value blockSizeY, Value blockSizeZ," + "ValueRange operands"> + ]; + + let hasCanonicalizer = 1; + + let extraClassDeclaration = [{ + /// Get the SSA values corresponding to kernel block identifiers. + KernelDim3 getBlockIds(); + /// Get the SSA values corresponding to kernel thread identifiers. + KernelDim3 getThreadIds(); + /// Get the SSA values corresponding to kernel grid size. + KernelDim3 getGridSize(); + /// Get the SSA values corresponding to kernel block size. + KernelDim3 getBlockSize(); + /// Get the operand values passed as kernel arguments. + operand_range getKernelOperandValues(); + /// Get the operand types passed as kernel arguments. + operand_type_range getKernelOperandTypes(); + + /// Get the SSA values passed as operands to specify the grid size. + KernelDim3 getGridSizeOperandValues(); + /// Get the SSA values passed as operands to specify the block size. + KernelDim3 getBlockSizeOperandValues(); + + /// Get the SSA values of the kernel arguments. + iterator_range getKernelArguments(); + + /// Erase the `index`-th kernel argument. Both the entry block argument and + /// the operand will be dropped. The block argument must not have any uses. + void eraseKernelArgument(unsigned index); + + static StringRef getBlocksKeyword() { return "blocks"; } + static StringRef getThreadsKeyword() { return "threads"; } + static StringRef getArgsKeyword() { return "args"; } + + /// The number of launch configuration operands, placed at the leading + /// positions of the operand list. + static constexpr unsigned kNumConfigOperands = 6; + + /// The number of region attributes containing the launch configuration, + /// placed in the leading positions of the argument list. 
+ static constexpr unsigned kNumConfigRegionAttributes = 12; + }]; + + let parser = [{ return parseLaunchOp(parser, result); }]; + let printer = [{ printLaunchOp(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; +} + +def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>, + Results<(outs)> { + let summary = "Terminator for GPU launch regions."; + let description = [{ + A terminator operation for regions that appear in the body of `gpu.launch` + operation. These regions are not expected to return any value so the + terminator takes no operands. + }]; + + let parser = [{ return success(); }]; + let printer = [{ p << getOperationName(); }]; +} + +def GPU_YieldOp : GPU_Op<"yield", [Terminator]>, + Arguments<(ins Variadic:$values)> { + let summary = "GPU yield operation"; + let description = [{ + "gpu.yield" is a special terminator operation for blocks inside regions + in gpu ops. It returns values to the immediately enclosing gpu op. + + Example: + + ```gpu.yield %f0, %f1 : f32, f32 + ``` + }]; +} + +// These mirror the XLA ComparisonDirection enum. +def GPU_AllReduceOpAdd : StrEnumAttrCase<"add">; +def GPU_AllReduceOpMul : StrEnumAttrCase<"mul">; + +def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr", + "built-in reduction operations supported by gpu.allreduce.", + [ + GPU_AllReduceOpAdd, + GPU_AllReduceOpMul, + ]>; + +def GPU_AllReduceOp : GPU_Op<"all_reduce", + [SameOperandsAndResultType, IsolatedFromAbove]>, + Arguments<(ins AnyType:$value, + OptionalAttr:$op)>, + Results<(outs AnyType)> { + let summary = "Reduce values among workgroup."; + let description = [{ + The "all_reduce" op reduces the value of every work item across a local + workgroup. The result is equal for all work items of a workgroup. 
+ + For example, both + ``` + %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32) + %2 = "gpu.all_reduce"(%0) ({ + ^bb(%lhs : f32, %rhs : f32): + %sum = addf %lhs, %rhs : f32 + "gpu.yield"(%sum) : (f32) -> () + }) : (f32) -> (f32) + ``` + compute the sum of each work item's %0 value. The first version specifies + the accumulation as operation, whereas the second version specifies the + accumulation as code region. The accumulation operation must either be + `add` or `mul`. + + Either none or all work items of a workgroup need to execute this op + in convergence. + }]; + let regions = (region AnyRegion:$body); + let verifier = [{ return ::verifyAllReduce(*this); }]; +} + +def GPU_ShuffleOpXor : StrEnumAttrCase<"xor">; + +def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr", + "Indexing modes supported by gpu.shuffle.", + [ + GPU_ShuffleOpXor, + ]>; + +def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>, + Arguments<(ins AnyType:$value, I32:$offset, I32:$width, + GPU_ShuffleModeAttr:$mode)>, + Results<(outs AnyType:$result, I1:$valid)> { + let summary = "Shuffles values within a subgroup."; + let description = [{ + The "shuffle" op moves values to a different invocation within the same + subgroup. + + For example + ``` + %1, %2 = gpu.shuffle %0, %offset, %width xor : f32 + ``` + for lane k returns the value from lane `k ^ offset` and `true` if that lane + is smaller than %width. Otherwise it returns an unspecified value and + `false`. A lane is the index of an invocation relative to its subgroup. + + The width specifies the number of invocations that participate in the + shuffle. The width needs to be the same for all invocations that participate + in the shuffle. Exactly the first `width` invocations of a subgroup need to + execute this op in convergence. 
+ }]; + let verifier = [{ return ::verifyShuffleOp(*this); }]; + let printer = [{ printShuffleOp(p, *this); }]; + let parser = [{ return parseShuffleOp(parser, result); }]; +} + +def GPU_BarrierOp : GPU_Op<"barrier"> { + let summary = "Synchronizes all work items of a workgroup."; + let description = [{ + The "barrier" op synchronizes all work items of a workgroup. It is used + to coordinate communication between the work items of the workgroup. + + ``` + gpu.barrier + ``` + waits until all work items in the workgroup have reached this point + and all memory accesses made by these work items prior to the op are + visible to all work items in the workgroup. Data hazards between work items + accessing the same memory can be avoided by synchronizing work items + in-between these accesses. + + Either none or all work items of a workgroup need to execute this op + in convergence. + }]; + let parser = [{ return success(); }]; + let printer = [{ p << getOperationName(); }]; +} + +#endif // GPU_OPS diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..daf6d28d4526a8cb1d698b06970ab85634b05c8f --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -0,0 +1,27 @@ +//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes that expose pass constructors. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_PASSES_H_ +#define MLIR_DIALECT_GPU_PASSES_H_ + +#include + +namespace mlir { + +class ModuleOp; +template class OpPassBase; + +std::unique_ptr> createGpuKernelOutliningPass(); + +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa68eff91b0c52e7658331e369731b0261445fc3 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -0,0 +1,19 @@ +set(LLVM_TARGET_DEFINITIONS LLVMOps.td) +mlir_tablegen(LLVMOps.h.inc -gen-op-decls) +mlir_tablegen(LLVMOps.cpp.inc -gen-op-defs) +mlir_tablegen(LLVMOpsEnums.h.inc -gen-enum-decls) +mlir_tablegen(LLVMOpsEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRLLVMOpsIncGen) + +add_mlir_dialect(NVVMOps NVVMOps) +add_mlir_dialect(ROCDLOps ROCDLOps) + +set(LLVM_TARGET_DEFINITIONS LLVMOps.td) +mlir_tablegen(LLVMConversions.inc -gen-llvmir-conversions) +add_public_tablegen_target(MLIRLLVMConversionsIncGen) +set(LLVM_TARGET_DEFINITIONS NVVMOps.td) +mlir_tablegen(NVVMConversions.inc -gen-llvmir-conversions) +add_public_tablegen_target(MLIRNVVMConversionsIncGen) +set(LLVM_TARGET_DEFINITIONS ROCDLOps.td) +mlir_tablegen(ROCDLConversions.inc -gen-llvmir-conversions) +add_public_tablegen_target(MLIRROCDLConversionsIncGen) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..d36619bb9a9515a50f94df323e441b0ddef00a58 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -0,0 +1,199 @@ +//===- LLVMDialect.h - MLIR LLVM IR dialect ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the LLVM IR dialect in MLIR, containing LLVM operations and +// LLVM type system. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_LLVMDIALECT_H_ +#define MLIR_DIALECT_LLVMIR_LLVMDIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/TypeSupport.h" +#include "mlir/IR/Types.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" + +#include "mlir/Dialect/LLVMIR/LLVMOpsEnums.h.inc" + +namespace llvm { +class Type; +class LLVMContext; +} // end namespace llvm + +namespace mlir { +namespace LLVM { +class LLVMDialect; + +namespace detail { +struct LLVMTypeStorage; +struct LLVMDialectImpl; +} // namespace detail + +class LLVMType : public mlir::Type::TypeBase { +public: + enum Kind { + LLVM_TYPE = FIRST_LLVM_TYPE, + }; + + using Base::Base; + + static bool kindof(unsigned kind) { return kind == LLVM_TYPE; } + + LLVMDialect &getDialect(); + llvm::Type *getUnderlyingType() const; + + /// Utilities to identify types. + bool isFloatTy() { return getUnderlyingType()->isFloatTy(); } + bool isDoubleTy() { return getUnderlyingType()->isDoubleTy(); } + bool isIntegerTy() { return getUnderlyingType()->isIntegerTy(); } + bool isIntegerTy(unsigned bitwidth) { + return getUnderlyingType()->isIntegerTy(bitwidth); + } + + /// Array type utilities. + LLVMType getArrayElementType(); + unsigned getArrayNumElements(); + bool isArrayTy(); + + /// Vector type utilities. + LLVMType getVectorElementType(); + bool isVectorTy(); + + /// Function type utilities. 
+ LLVMType getFunctionParamType(unsigned argIdx); + unsigned getFunctionNumParams(); + LLVMType getFunctionResultType(); + bool isFunctionTy(); + + /// Pointer type utilities. + LLVMType getPointerTo(unsigned addrSpace = 0); + LLVMType getPointerElementTy(); + bool isPointerTy(); + + /// Struct type utilities. + LLVMType getStructElementType(unsigned i); + unsigned getStructNumElements(); + bool isStructTy(); + + /// Utilities used to generate floating point types. + static LLVMType getDoubleTy(LLVMDialect *dialect); + static LLVMType getFloatTy(LLVMDialect *dialect); + static LLVMType getHalfTy(LLVMDialect *dialect); + static LLVMType getFP128Ty(LLVMDialect *dialect); + static LLVMType getX86_FP80Ty(LLVMDialect *dialect); + + /// Utilities used to generate integer types. + static LLVMType getIntNTy(LLVMDialect *dialect, unsigned numBits); + static LLVMType getInt1Ty(LLVMDialect *dialect) { + return getIntNTy(dialect, /*numBits=*/1); + } + static LLVMType getInt8Ty(LLVMDialect *dialect) { + return getIntNTy(dialect, /*numBits=*/8); + } + static LLVMType getInt8PtrTy(LLVMDialect *dialect) { + return getInt8Ty(dialect).getPointerTo(); + } + static LLVMType getInt16Ty(LLVMDialect *dialect) { + return getIntNTy(dialect, /*numBits=*/16); + } + static LLVMType getInt32Ty(LLVMDialect *dialect) { + return getIntNTy(dialect, /*numBits=*/32); + } + static LLVMType getInt64Ty(LLVMDialect *dialect) { + return getIntNTy(dialect, /*numBits=*/64); + } + + /// Utilities used to generate other miscellaneous types. 
+ static LLVMType getArrayTy(LLVMType elementType, uint64_t numElements); + static LLVMType getFunctionTy(LLVMType result, ArrayRef params, + bool isVarArg); + static LLVMType getFunctionTy(LLVMType result, bool isVarArg) { + return getFunctionTy(result, llvm::None, isVarArg); + } + static LLVMType getStructTy(LLVMDialect *dialect, ArrayRef elements, + bool isPacked = false); + static LLVMType getStructTy(LLVMDialect *dialect, bool isPacked = false) { + return getStructTy(dialect, llvm::None, isPacked); + } + template + static typename std::enable_if::value, + LLVMType>::type + getStructTy(LLVMType elt1, Args... elts) { + SmallVector fields({elt1, elts...}); + return getStructTy(&elt1.getDialect(), fields); + } + static LLVMType getVectorTy(LLVMType elementType, unsigned numElements); + static LLVMType getVoidTy(LLVMDialect *dialect); + +private: + friend LLVMDialect; + + /// Get an LLVMType with a pre-existing llvm type. + static LLVMType get(MLIRContext *context, llvm::Type *llvmType); + + /// Get an LLVMType with an llvm type that may cause changes to the underlying + /// llvm context when constructed. + static LLVMType getLocked(LLVMDialect *dialect, + function_ref typeBuilder); +}; + +///// Ops ///// +#define GET_OP_CLASSES +#include "mlir/Dialect/LLVMIR/LLVMOps.h.inc" + +class LLVMDialect : public Dialect { +public: + explicit LLVMDialect(MLIRContext *context); + ~LLVMDialect(); + static StringRef getDialectNamespace() { return "llvm"; } + + llvm::LLVMContext &getLLVMContext(); + llvm::Module &getLLVMModule(); + + /// Parse a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + /// Print a type registered to this dialect. + void printType(Type type, DialectAsmPrinter &os) const override; + + /// Verify a region argument attribute registered to this dialect. + /// Returns failure if the verification failed, success otherwise. 
+ LogicalResult verifyRegionArgAttribute(Operation *op, unsigned regionIdx, + unsigned argIdx, + NamedAttribute argAttr) override; + +private: + friend LLVMType; + + std::unique_ptr impl; +}; + +/// Create an LLVM global containing the string "value" at the module containing +/// surrounding the insertion point of builder. Obtain the address of that +/// global and use it to compute the address of the first character in the +/// string (operations inserted at the builder insertion point). +Value createGlobalString(Location loc, OpBuilder &builder, StringRef name, + StringRef value, LLVM::Linkage linkage, + LLVM::LLVMDialect *llvmDialect); + +/// LLVM requires some operations to be inside of a Module operation. This +/// function confirms that the Operation has the desired properties. +bool satisfiesLLVMModule(Operation *op); + +} // end namespace LLVM +} // end namespace mlir + +#endif // MLIR_DIALECT_LLVMIR_LLVMDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td new file mode 100644 index 0000000000000000000000000000000000000000..ed935d5b7f7829cd524e9943f0b2946bd952d5af --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -0,0 +1,52 @@ +//===-- LLVMOpBase.td - LLVM IR dialect shared definitions -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains shared definitions for the LLVM IR dialect and its +// subdialects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVMIR_OP_BASE +#define LLVMIR_OP_BASE + +include "mlir/IR/OpBase.td" + +def LLVM_Dialect : Dialect { + let name = "llvm"; + let cppNamespace = "LLVM"; +} + +// LLVM IR type wrapped in MLIR. 
+def LLVM_Type : Type()">, + "LLVM dialect type">; + +// Type constraint accepting only wrapped LLVM integer types. +def LLVMInt : TypeConstraint< + And<[LLVM_Type.predicate, + CPred<"$_self.cast<::mlir::LLVM::LLVMType>().isIntegerTy()">]>, + "LLVM dialect integer">; + +// Base class for LLVM operations. Defines the interface to the llvm::IRBuilder +// used to translate to LLVM IR proper. +class LLVM_OpBase traits = []> : + Op { + // A pattern for constructing the LLVM IR Instruction (or other Value) that + // corresponds to this op. This pattern can use `builder` to refer to an + // `llvm::IRBuilder<>` instance, $-names of arguments and results and the + // following special variable names: + // - $_resultType - substituted with the LLVM IR type of the result; + // - $_numOperands - substituted with the number of operands (including + // the variadic ones); + // - $_hasResult - substituted with a check that a variadic-result op does + // have a result (LLVM ops can have 0 or 1 result); + // - $_location - mlir::Location object of the instruction. + // Additionally, `$$` can be used to produce the dollar character. + string llvmBuilder = ""; +} + +#endif // LLVMIR_OP_BASE diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td new file mode 100644 index 0000000000000000000000000000000000000000..2e47eb034747d31d2ce888f2d7ba1ae77e2ce548 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -0,0 +1,734 @@ +//===-- LLVMOps.td - LLVM IR dialect op definition file ----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the LLVM IR operation definition file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVMIR_OPS +#define LLVMIR_OPS + +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" + +// Base class for LLVM operations. All operations get an "llvm." prefix in +// their name automatically. LLVM operations have either zero or one result, +// this class is specialized below for both cases and should not be used +// directly. +class LLVM_Op traits = []> : + LLVM_OpBase { +} + +class LLVM_Builder { + string llvmBuilder = builder; +} + +def LLVM_OneResultOpBuilder : OpBuilder< + "Builder *, OperationState &result, Type resultType, " + "ValueRange operands, ArrayRef attributes = {}", + [{ + if (resultType) result.addTypes(resultType); + result.addOperands(operands); + for (auto namedAttr : attributes) { + result.addAttribute(namedAttr.first, namedAttr.second); + } + }]>; + +def LLVM_ZeroResultOpBuilder : OpBuilder< + "Builder *, OperationState &result, ValueRange operands, " + "ArrayRef attributes = {}", + [{ + result.addOperands(operands); + for (auto namedAttr : attributes) { + result.addAttribute(namedAttr.first, namedAttr.second); + } + }]>; + +class LLVM_TwoBuilders { + list builders = [b1, b2]; +} + +// Base class for LLVM operations with one result. +class LLVM_OneResultOp traits = []> : + LLVM_Op, Results<(outs LLVM_Type:$res)> { + let builders = [LLVM_OneResultOpBuilder]; +} + +// Compatibility builder that takes an instance of wrapped llvm::VoidType +// to indicate no result. 
+def LLVM_VoidResultTypeOpBuilder : OpBuilder< + "Builder *builder, OperationState &result, Type resultType, " + "ValueRange operands, ArrayRef attributes = {}", + [{ + auto llvmType = resultType.dyn_cast(); (void)llvmType; + assert(llvmType && "result must be an LLVM type"); + assert(llvmType.getUnderlyingType() && + llvmType.getUnderlyingType()->isVoidTy() && + "for zero-result operands, only 'void' is accepted as result type"); + build(builder, result, operands, attributes); + }]>; + +// Base class for LLVM operations with zero results. +class LLVM_ZeroResultOp traits = []> : + LLVM_Op, Results<(outs)>, + LLVM_TwoBuilders; + +// Base class for LLVM terminator operations. All terminator operations have +// zero results and an optional list of successors. +class LLVM_TerminatorOp traits = []> : + LLVM_Op, + Arguments<(ins Variadic:$args)>, Results<(outs)> { + let builders = [ + OpBuilder< + "Builder *, OperationState &result, " + "ValueRange properOperands, " + "ArrayRef destinations, " + "ArrayRef operands, " + "ArrayRef attributes = {}", + [{ + result.addOperands(properOperands); + for (auto kvp : llvm::zip(destinations, operands)) { + result.addSuccessor(std::get<0>(kvp), std::get<1>(kvp)); + } + for (auto namedAttr : attributes) { + result.addAttribute(namedAttr.first, namedAttr.second); + } + }] + >, + OpBuilder< + "Builder *builder, OperationState &result, " + "ValueRange properOperands, " + "ArrayRef destinations, " + "ArrayRef attributes = {}", + [{ + SmallVector operands(destinations.size(), {}); + build(builder, result, properOperands, + destinations, operands, attributes); + }] + >, + ]; +} + +// Class for arithmetic binary operations. +class LLVM_ArithmeticOp traits = []> : + LLVM_OneResultOp, + Arguments<(ins LLVM_Type:$lhs, LLVM_Type:$rhs)>, + LLVM_Builder<"$res = builder." 
# builderFunc # "($lhs, $rhs);"> { + let parser = [{ return impl::parseOneResultSameOperandTypeOp(parser, result); }]; + let printer = [{ mlir::impl::printOneResultOp(this->getOperation(), p); }]; +} +class LLVM_UnaryArithmeticOp traits = []> : + LLVM_OneResultOp, + Arguments<(ins LLVM_Type:$operand)>, + LLVM_Builder<"$res = builder." # builderFunc # "($operand);"> { + let parser = [{ return impl::parseOneResultSameOperandTypeOp(parser, result); }]; + let printer = [{ mlir::impl::printOneResultOp(this->getOperation(), p); }]; +} + +// Integer binary operations. +def LLVM_AddOp : LLVM_ArithmeticOp<"add", "CreateAdd", [Commutative]>; +def LLVM_SubOp : LLVM_ArithmeticOp<"sub", "CreateSub">; +def LLVM_MulOp : LLVM_ArithmeticOp<"mul", "CreateMul", [Commutative]>; +def LLVM_UDivOp : LLVM_ArithmeticOp<"udiv", "CreateUDiv">; +def LLVM_SDivOp : LLVM_ArithmeticOp<"sdiv", "CreateSDiv">; +def LLVM_URemOp : LLVM_ArithmeticOp<"urem", "CreateURem">; +def LLVM_SRemOp : LLVM_ArithmeticOp<"srem", "CreateSRem">; +def LLVM_AndOp : LLVM_ArithmeticOp<"and", "CreateAnd">; +def LLVM_OrOp : LLVM_ArithmeticOp<"or", "CreateOr">; +def LLVM_XOrOp : LLVM_ArithmeticOp<"xor", "CreateXor">; +def LLVM_ShlOp : LLVM_ArithmeticOp<"shl", "CreateShl">; +def LLVM_LShrOp : LLVM_ArithmeticOp<"lshr", "CreateLShr">; +def LLVM_AShrOp : LLVM_ArithmeticOp<"ashr", "CreateAShr">; + +// Predicate for integer comparisons. 
+def ICmpPredicateEQ : I64EnumAttrCase<"eq", 0>; +def ICmpPredicateNE : I64EnumAttrCase<"ne", 1>; +def ICmpPredicateSLT : I64EnumAttrCase<"slt", 2>; +def ICmpPredicateSLE : I64EnumAttrCase<"sle", 3>; +def ICmpPredicateSGT : I64EnumAttrCase<"sgt", 4>; +def ICmpPredicateSGE : I64EnumAttrCase<"sge", 5>; +def ICmpPredicateULT : I64EnumAttrCase<"ult", 6>; +def ICmpPredicateULE : I64EnumAttrCase<"ule", 7>; +def ICmpPredicateUGT : I64EnumAttrCase<"ugt", 8>; +def ICmpPredicateUGE : I64EnumAttrCase<"uge", 9>; +def ICmpPredicate : I64EnumAttr< + "ICmpPredicate", + "llvm.icmp comparison predicate", + [ICmpPredicateEQ, ICmpPredicateNE, ICmpPredicateSLT, ICmpPredicateSLE, + ICmpPredicateSGT, ICmpPredicateSGE, ICmpPredicateULT, ICmpPredicateULE, + ICmpPredicateUGT, ICmpPredicateUGE]> { + let cppNamespace = "::mlir::LLVM"; +} + +// Other integer operations. +def LLVM_ICmpOp : LLVM_OneResultOp<"icmp", [NoSideEffect]>, + Arguments<(ins ICmpPredicate:$predicate, LLVM_Type:$lhs, + LLVM_Type:$rhs)> { + let llvmBuilder = [{ + $res = builder.CreateICmp(getLLVMCmpPredicate($predicate), $lhs, $rhs); + }]; + let builders = [OpBuilder< + "Builder *b, OperationState &result, ICmpPredicate predicate, Value lhs, " + "Value rhs", [{ + LLVMDialect *dialect = &lhs->getType().cast().getDialect(); + build(b, result, LLVMType::getInt1Ty(dialect), + b->getI64IntegerAttr(static_cast(predicate)), lhs, rhs); + }]>]; + let parser = [{ return parseCmpOp(parser, result); }]; + let printer = [{ printICmpOp(p, *this); }]; +} + +// Predicate for float comparisons +def FCmpPredicateFALSE : I64EnumAttrCase<"_false", 0>; +def FCmpPredicateOEQ : I64EnumAttrCase<"oeq", 1>; +def FCmpPredicateOGT : I64EnumAttrCase<"ogt", 2>; +def FCmpPredicateOGE : I64EnumAttrCase<"oge", 3>; +def FCmpPredicateOLT : I64EnumAttrCase<"olt", 4>; +def FCmpPredicateOLE : I64EnumAttrCase<"ole", 5>; +def FCmpPredicateONE : I64EnumAttrCase<"one", 6>; +def FCmpPredicateORD : I64EnumAttrCase<"ord", 7>; +def FCmpPredicateUEQ : 
I64EnumAttrCase<"ueq", 8>; +def FCmpPredicateUGT : I64EnumAttrCase<"ugt", 9>; +def FCmpPredicateUGE : I64EnumAttrCase<"uge", 10>; +def FCmpPredicateULT : I64EnumAttrCase<"ult", 11>; +def FCmpPredicateULE : I64EnumAttrCase<"ule", 12>; +def FCmpPredicateUNE : I64EnumAttrCase<"une", 13>; +def FCmpPredicateUNO : I64EnumAttrCase<"uno", 14>; +def FCmpPredicateTRUE : I64EnumAttrCase<"_true", 15>; + +def FCmpPredicate : I64EnumAttr< + "FCmpPredicate", + "llvm.fcmp comparison predicate", + [FCmpPredicateFALSE, FCmpPredicateOEQ, FCmpPredicateOGT, FCmpPredicateOGE, + FCmpPredicateOLT, FCmpPredicateOLE, FCmpPredicateONE, FCmpPredicateORD, + FCmpPredicateUEQ, FCmpPredicateUGT, FCmpPredicateUGE, FCmpPredicateULT, + FCmpPredicateULE, FCmpPredicateUNE, FCmpPredicateUNO, FCmpPredicateTRUE + ]> { + let cppNamespace = "::mlir::LLVM"; +} + +// Other integer operations. +def LLVM_FCmpOp : LLVM_OneResultOp<"fcmp", [NoSideEffect]>, + Arguments<(ins FCmpPredicate:$predicate, LLVM_Type:$lhs, + LLVM_Type:$rhs)> { + let llvmBuilder = [{ + $res = builder.CreateFCmp(getLLVMCmpPredicate($predicate), $lhs, $rhs); + }]; + let builders = [OpBuilder< + "Builder *b, OperationState &result, FCmpPredicate predicate, Value lhs, " + "Value rhs", [{ + LLVMDialect *dialect = &lhs->getType().cast().getDialect(); + build(b, result, LLVMType::getInt1Ty(dialect), + b->getI64IntegerAttr(static_cast(predicate)), lhs, rhs); + }]>]; + let parser = [{ return parseCmpOp(parser, result); }]; + let printer = [{ printFCmpOp(p, *this); }]; +} + +// Floating point binary operations. +def LLVM_FAddOp : LLVM_ArithmeticOp<"fadd", "CreateFAdd">; +def LLVM_FSubOp : LLVM_ArithmeticOp<"fsub", "CreateFSub">; +def LLVM_FMulOp : LLVM_ArithmeticOp<"fmul", "CreateFMul">; +def LLVM_FDivOp : LLVM_ArithmeticOp<"fdiv", "CreateFDiv">; +def LLVM_FRemOp : LLVM_ArithmeticOp<"frem", "CreateFRem">; +def LLVM_FNegOp : LLVM_UnaryArithmeticOp<"fneg", "CreateFNeg">; + +// Memory-related operations. 
+def LLVM_AllocaOp : + LLVM_OneResultOp<"alloca">, + Arguments<(ins LLVM_Type:$arraySize, OptionalAttr:$alignment)> { + string llvmBuilder = [{ + auto *alloca = builder.CreateAlloca( + $_resultType->getPointerElementType(), $arraySize); + if ($alignment.hasValue()) { + auto align = $alignment.getValue().getZExtValue(); + if (align != 0) + alloca->setAlignment(llvm::MaybeAlign(align)); + } + $res = alloca; + }]; + let builders = [OpBuilder< + "Builder *b, OperationState &result, Type resultType, Value arraySize, " + "unsigned alignment", + [{ + if (alignment == 0) + return build(b, result, resultType, arraySize, IntegerAttr()); + build(b, result, resultType, arraySize, b->getI64IntegerAttr(alignment)); + }]>]; + let parser = [{ return parseAllocaOp(parser, result); }]; + let printer = [{ printAllocaOp(p, *this); }]; + let verifier = [{ + if (alignment().hasValue()) { + auto align = alignment().getValue().getSExtValue(); + if (align < 0) + return emitOpError("expected positive alignment"); + } + return success(); + }]; +} +def LLVM_GEPOp : LLVM_OneResultOp<"getelementptr", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$base, Variadic:$indices)>, + LLVM_Builder<"$res = builder.CreateGEP($base, $indices);"> { + let parser = [{ return parseGEPOp(parser, result); }]; + let printer = [{ printGEPOp(p, *this); }]; +} +def LLVM_LoadOp : LLVM_OneResultOp<"load">, Arguments<(ins LLVM_Type:$addr)>, + LLVM_Builder<"$res = builder.CreateLoad($addr);"> { + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value addr", + [{ + auto type = addr->getType().cast().getPointerElementTy(); + build(b, result, type, addr); + }]>]; + let parser = [{ return parseLoadOp(parser, result); }]; + let printer = [{ printLoadOp(p, *this); }]; +} +def LLVM_StoreOp : LLVM_ZeroResultOp<"store">, + Arguments<(ins LLVM_Type:$value, LLVM_Type:$addr)>, + LLVM_Builder<"builder.CreateStore($value, $addr);"> { + let parser = [{ return parseStoreOp(parser, result); }]; + let printer = [{ 
printStoreOp(p, *this); }]; +} + +// Casts. +class LLVM_CastOp traits = []> : + LLVM_OneResultOp, + Arguments<(ins LLVM_Type:$arg)>, + LLVM_Builder<"$res = builder." # builderFunc # "($arg, $_resultType);"> { + let parser = [{ return mlir::impl::parseCastOp(parser, result); }]; + let printer = [{ mlir::impl::printCastOp(this->getOperation(), p); }]; +} +def LLVM_BitcastOp : LLVM_CastOp<"bitcast", "CreateBitCast">; +def LLVM_AddrSpaceCastOp : LLVM_CastOp<"addrspacecast", "CreateAddrSpaceCast">; +def LLVM_IntToPtrOp : LLVM_CastOp<"inttoptr", "CreateIntToPtr">; +def LLVM_PtrToIntOp : LLVM_CastOp<"ptrtoint", "CreatePtrToInt">; +def LLVM_SExtOp : LLVM_CastOp<"sext", "CreateSExt">; +def LLVM_ZExtOp : LLVM_CastOp<"zext", "CreateZExt">; +def LLVM_TruncOp : LLVM_CastOp<"trunc", "CreateTrunc">; +def LLVM_SIToFPOp : LLVM_CastOp<"sitofp", "CreateSIToFP">; +def LLVM_UIToFPOp : LLVM_CastOp<"uitofp", "CreateUIToFP">; +def LLVM_FPToSIOp : LLVM_CastOp<"fptosi", "CreateFPToSI">; +def LLVM_FPToUIOp : LLVM_CastOp<"fptoui", "CreateFPToUI">; +def LLVM_FPExtOp : LLVM_CastOp<"fpext", "CreateFPExt">; +def LLVM_FPTruncOp : LLVM_CastOp<"fptrunc", "CreateFPTrunc">; + +// Call-related operations. 
+def LLVM_CallOp : LLVM_Op<"call">, + Arguments<(ins OptionalAttr:$callee, + Variadic)>, + Results<(outs Variadic)>, + LLVM_TwoBuilders { + let verifier = [{ + if (getNumResults() > 1) + return emitOpError("must have 0 or 1 result"); + return success(); + }]; + let parser = [{ return parseCallOp(parser, result); }]; + let printer = [{ printCallOp(p, *this); }]; +} +def LLVM_ExtractElementOp : LLVM_OneResultOp<"extractelement", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$vector, + LLVM_Type:$position)> { + string llvmBuilder = [{ + $res = builder.CreateExtractElement($vector, $position); + }]; + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value vector, Value position," + "ArrayRef attrs = {}">]; + let parser = [{ return parseExtractElementOp(parser, result); }]; + let printer = [{ printExtractElementOp(p, *this); }]; +} +def LLVM_ExtractValueOp : LLVM_OneResultOp<"extractvalue", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$container, + ArrayAttr:$position)> { + string llvmBuilder = [{ + $res = builder.CreateExtractValue($container, extractPosition($position)); + }]; + let parser = [{ return parseExtractValueOp(parser, result); }]; + let printer = [{ printExtractValueOp(p, *this); }]; +} +def LLVM_InsertElementOp : LLVM_OneResultOp<"insertelement", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$vector, LLVM_Type:$value, + LLVM_Type:$position)> { + string llvmBuilder = [{ + $res = builder.CreateInsertElement($vector, $value, $position); + }]; + let parser = [{ return parseInsertElementOp(parser, result); }]; + let printer = [{ printInsertElementOp(p, *this); }]; +} +def LLVM_InsertValueOp : LLVM_OneResultOp<"insertvalue", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$container, LLVM_Type:$value, + ArrayAttr:$position)> { + string llvmBuilder = [{ + $res = builder.CreateInsertValue($container, $value, + extractPosition($position)); + }]; + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value container, Value value, " + "ArrayAttr 
position", + [{ + build(b, result, container->getType(), container, value, position); + }]>]; + let parser = [{ return parseInsertValueOp(parser, result); }]; + let printer = [{ printInsertValueOp(p, *this); }]; +} +def LLVM_ShuffleVectorOp + : LLVM_OneResultOp<"shufflevector", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$v1, LLVM_Type:$v2, ArrayAttr:$mask)>, + LLVM_Builder< + "$res = builder.CreateShuffleVector($v1, $v2, extractPosition($mask));"> { + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value v1, Value v2, " + "ArrayAttr mask, ArrayRef attrs = {}">]; + let verifier = [{ + auto wrappedVectorType1 = v1()->getType().cast(); + auto wrappedVectorType2 = v2()->getType().cast(); + if (!wrappedVectorType2.getUnderlyingType()->isVectorTy()) + return emitOpError("expected LLVM IR Dialect vector type for operand #2"); + if (wrappedVectorType1.getVectorElementType() != + wrappedVectorType2.getVectorElementType()) + return emitOpError("expected matching LLVM IR Dialect element types"); + return success(); + }]; + let parser = [{ return parseShuffleVectorOp(parser, result); }]; + let printer = [{ printShuffleVectorOp(p, *this); }]; +} + +// Misc operations. +def LLVM_SelectOp + : LLVM_OneResultOp<"select", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$condition, LLVM_Type:$trueValue, + LLVM_Type:$falseValue)>, + LLVM_Builder< + "$res = builder.CreateSelect($condition, $trueValue, $falseValue);"> { + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value condition, Value lhs, " + "Value rhs", [{ + build(b, result, lhs->getType(), condition, lhs, rhs); + }]>]; + let parser = [{ return parseSelectOp(parser, result); }]; + let printer = [{ printSelectOp(p, *this); }]; +} + +// Terminators. 
+def LLVM_BrOp : LLVM_TerminatorOp<"br", []> { + let parser = [{ return parseBrOp(parser, result); }]; + let printer = [{ printBrOp(p, *this); }]; +} +def LLVM_CondBrOp : LLVM_TerminatorOp<"cond_br", []> { + let verifier = [{ + if (getNumSuccessors() != 2) + return emitOpError("expected exactly two successors"); + return success(); + }]; + let parser = [{ return parseCondBrOp(parser, result); }]; + let printer = [{ printCondBrOp(p, *this); }]; +} +def LLVM_ReturnOp : LLVM_TerminatorOp<"return", []> { + string llvmBuilder = [{ + if ($_numOperands != 0) + builder.CreateRet($args[0]); + else + builder.CreateRetVoid(); + }]; + + let verifier = [{ + if (getNumOperands() > 1) + return emitOpError("expects at most 1 operand"); + return success(); + }]; + + let parser = [{ return parseReturnOp(parser, result); }]; + let printer = [{ printReturnOp(p, *this); }]; +} +def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable", []> { + string llvmBuilder = [{ builder.CreateUnreachable(); }]; + let parser = [{ return success(); }]; + let printer = [{ p << getOperationName(); }]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Auxiliary operations (do not appear in LLVM IR but necessary for the dialect +// to work correctly). +//////////////////////////////////////////////////////////////////////////////// + +// Linkage attribute is used on functions and globals. The order follows that of +// https://llvm.org/docs/LangRef.html#linkage-types. The names are equivalent to +// visible names in the IR rather than to enum values names in llvm::GlobalValue +// since the latter is easier to change. 
+def LinkagePrivate : I64EnumAttrCase<"Private", 0>; +def LinkageInternal : I64EnumAttrCase<"Internal", 1>; +def LinkageAvailableExternally : I64EnumAttrCase<"AvailableExternally", 2>; +def LinkageLinkonce : I64EnumAttrCase<"Linkonce", 3>; +def LinkageWeak : I64EnumAttrCase<"Weak", 4>; +def LinkageCommon : I64EnumAttrCase<"Common", 5>; +def LinkageAppending : I64EnumAttrCase<"Appending", 6>; +def LinkageExternWeak : I64EnumAttrCase<"ExternWeak", 7>; +def LinkageLinkonceODR : I64EnumAttrCase<"LinkonceODR", 8>; +def LinkageWeakODR : I64EnumAttrCase<"WeakODR", 9>; +def LinkageExternal : I64EnumAttrCase<"External", 10>; +def Linkage : I64EnumAttr< + "Linkage", + "LLVM linkage types", + [LinkagePrivate, LinkageInternal, LinkageAvailableExternally, + LinkageLinkonce, LinkageWeak, LinkageCommon, LinkageAppending, + LinkageExternWeak, LinkageLinkonceODR, LinkageWeakODR, LinkageExternal]> { + let cppNamespace = "::mlir::LLVM"; +} + + +def LLVM_AddressOfOp + : LLVM_OneResultOp<"mlir.addressof">, + Arguments<(ins FlatSymbolRefAttr:$global_name)> { + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, LLVMType resType, " + "StringRef name, ArrayRef attrs = {}", [{ + result.addAttribute("global_name", builder->getSymbolRefAttr(name)); + result.addAttributes(attrs); + result.addTypes(resType);}]>, + + OpBuilder<"Builder *builder, OperationState &result, GlobalOp global, " + "ArrayRef attrs = {}", [{ + build(builder, result, + global.getType().getPointerTo(global.addr_space().getZExtValue()), + global.sym_name(), attrs);}]> + ]; + + let extraClassDeclaration = [{ + /// Return the llvm.mlir.global operation that defined the value referenced + /// here. 
+ GlobalOp getGlobal(); + }]; + + let printer = "printAddressOfOp(p, *this);"; + let parser = "return parseAddressOfOp(parser, result);"; + let verifier = "return ::verify(*this);"; +} + +def LLVM_GlobalOp + : LLVM_ZeroResultOp<"mlir.global", + [IsolatedFromAbove, + SingleBlockImplicitTerminator<"ReturnOp">, Symbol]>, + Arguments<(ins TypeAttr:$type, UnitAttr:$constant, StrAttr:$sym_name, + Linkage:$linkage, + OptionalAttr:$value, + DefaultValuedAttr:$addr_space)> { + let summary = "LLVM dialect global."; + let description = [{ + Can contain an optional initializer region or attribute for simple + initializers. + + Examples: + // Initialized using an attribute. + llvm.mlir.global @a("abc") : !llvm<"[3 x i8]"> + // Initialized using a region. + llvm.mlir.global constant @b() : !llvm<"i32*"> { + %0 = llvm.constant(0 : i32) : !llvm.i32 + %1 = llvm.inttoptr %0 : !llvm.i32 to !llvm<"i32*"> + llvm.return %1 : !llvm<"i32*"> + } + }]; + let regions = (region AnyRegion:$initializer); + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, LLVMType type, " + "bool isConstant, Linkage linkage, StringRef name, " + "Attribute value, unsigned addrSpace = 0, " + "ArrayRef attrs = {}"> + ]; + + let extraClassDeclaration = [{ + /// Return the LLVM type of the global. + LLVMType getType() { + return type().cast(); + } + /// Return the initializer attribute if it exists, or a null attribute. + Attribute getValueOrNull() { + return value().getValueOr(Attribute()); + } + /// Return the initializer region. This may be empty, but if it is not it + /// terminates in an `llvm.return` op with the initializer value. + Region &getInitializerRegion() { + return getOperation()->getRegion(0); + } + /// Return the initializer block. If the initializer region is empty this + /// is nullptr. If it is not nullptr, it terminates with an `llvm.return` + /// op with the initializer value. + Block *getInitializerBlock() { + return getInitializerRegion().empty() ? 
+ nullptr : &getInitializerRegion().front(); + } + }]; + + let printer = "printGlobalOp(p, *this);"; + let parser = "return parseGlobalOp(parser, result);"; + let verifier = "return ::verify(*this);"; +} + +def LLVM_LLVMFuncOp + : LLVM_ZeroResultOp<"func", [IsolatedFromAbove, FunctionLike, Symbol]>, + Arguments<(ins DefaultValuedAttr:$linkage)> { + let summary = "LLVM dialect function, has wrapped LLVM IR function type"; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, StringRef name, " + "LLVMType type, LLVM::Linkage linkage = LLVM::Linkage::External, " + "ArrayRef attrs = {}, " + "ArrayRef argAttrs = {}"> + ]; + + let extraClassDeclaration = [{ + // Add an entry block to an empty function, and set up the block arguments + // to match the signature of the function. + Block *addEntryBlock(); + + LLVMType getType() { + return getAttrOfType(getTypeAttrName()) + .getValue().cast(); + } + bool isVarArg() { + return getType().getUnderlyingType()->isFunctionVarArg(); + } + + // Hook for OpTrait::FunctionLike, returns the number of function arguments. + // Depends on the type attribute being correct as checked by verifyType. + unsigned getNumFuncArguments(); + + // Hook for OpTrait::FunctionLike, returns the number of function results. + // Depends on the type attribute being correct as checked by verifyType. + unsigned getNumFuncResults(); + + // Hook for OpTrait::FunctionLike, called after verifying that the 'type' + // attribute is present. This can check for preconditions of the + // getNumArguments hook not failing. 
+ LogicalResult verifyType(); + }]; + + let verifier = [{ return ::verify(*this); }]; + let printer = [{ printLLVMFuncOp(p, *this); }]; + let parser = [{ return parseLLVMFuncOp(parser, result); }]; +} + +def LLVM_NullOp + : LLVM_OneResultOp<"mlir.null", [NoSideEffect]>, + LLVM_Builder<"$res = llvm::ConstantPointerNull::get(" + " cast($_resultType));"> { + let parser = [{ return parseNullOp(parser, result); }]; + let printer = [{ printNullOp(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; +} + +def LLVM_UndefOp : LLVM_OneResultOp<"mlir.undef", [NoSideEffect]>, + LLVM_Builder<"$res = llvm::UndefValue::get($_resultType);"> { + let parser = [{ return parseUndefOp(parser, result); }]; + let printer = [{ printUndefOp(p, *this); }]; +} +def LLVM_ConstantOp + : LLVM_OneResultOp<"mlir.constant", [NoSideEffect]>, + Arguments<(ins AnyAttr:$value)>, + LLVM_Builder<"$res = getLLVMConstant($_resultType, $value, $_location);"> +{ + let parser = [{ return parseConstantOp(parser, result); }]; + let printer = [{ printConstantOp(p, *this); }]; +} + +// Operations that correspond to LLVM intrinsics. With MLIR operation set being +// extendable, there is no reason to introduce a hard boundary between "core" +// operations and intrinsics. However, we systematically prefix them with +// "intr." to avoid potential name clashes. + +class LLVM_UnaryIntrinsicOp traits = []> : + LLVM_OneResultOp<"intr." # func, + !listconcat([NoSideEffect, SameOperandsAndResultType], traits)>, + Arguments<(ins LLVM_Type:$in)>, + LLVM_Builder<"$res = builder.CreateCall(llvm::Intrinsic::getDeclaration(" + "builder.GetInsertBlock()->getModule(), llvm::Intrinsic::" # func # "," + "{$in->getType()}), {$in});"> { +} + +class LLVM_BinaryIntrinsicOp traits = []> : + LLVM_OneResultOp<"intr." 
# func, + !listconcat([NoSideEffect, SameOperandsAndResultType], traits)>, + Arguments<(ins LLVM_Type:$a, LLVM_Type:$b)>, + LLVM_Builder<"$res = builder.CreateCall(llvm::Intrinsic::getDeclaration(" + "builder.GetInsertBlock()->getModule(), llvm::Intrinsic::" # func # "," + "{$a->getType(), $b->getType()}), {$a, $b});"> { +} + +class LLVM_TernaryIntrinsicOp traits = []> : + LLVM_OneResultOp<"intr." # func, + !listconcat([NoSideEffect, SameOperandsAndResultType], traits)>, + Arguments<(ins LLVM_Type:$a, LLVM_Type:$b, LLVM_Type:$c)>, + LLVM_Builder<"$res = builder.CreateCall(llvm::Intrinsic::getDeclaration(" + "builder.GetInsertBlock()->getModule(), llvm::Intrinsic::" # func # "," + "{$a->getType(), $b->getType(), $c->getType()}), {$a, $b, $c});"> { +} + +def LLVM_ExpOp : LLVM_UnaryIntrinsicOp<"exp">; +def LLVM_FAbsOp : LLVM_UnaryIntrinsicOp<"fabs">; +def LLVM_FCeilOp : LLVM_UnaryIntrinsicOp<"ceil">; +def LLVM_CosOp : LLVM_UnaryIntrinsicOp<"cos">; +def LLVM_CopySignOp : LLVM_BinaryIntrinsicOp<"copysign">; +def LLVM_FMulAddOp : LLVM_TernaryIntrinsicOp<"fmuladd">; + +def LLVM_LogOp : LLVM_Op<"intr.log", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$in)>, + Results<(outs LLVM_Type:$res)> { + let llvmBuilder = [{ + llvm::Module *module = builder.GetInsertBlock()->getModule(); + llvm::Function *fn = llvm::Intrinsic::getDeclaration( + module, llvm::Intrinsic::log, {$in->getType()}); + $res = builder.CreateCall(fn, {$in}); + }]; +} + +def LLVM_Log10Op : LLVM_Op<"intr.log10", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$in)>, + Results<(outs LLVM_Type:$res)> { + let llvmBuilder = [{ + llvm::Module *module = builder.GetInsertBlock()->getModule(); + llvm::Function *fn = llvm::Intrinsic::getDeclaration( + module, llvm::Intrinsic::log10, {$in->getType()}); + $res = builder.CreateCall(fn, {$in}); + }]; +} + +def LLVM_Log2Op : LLVM_Op<"intr.log2", [NoSideEffect]>, + Arguments<(ins LLVM_Type:$in)>, + Results<(outs LLVM_Type:$res)> { + let llvmBuilder = [{ + llvm::Module *module = 
builder.GetInsertBlock()->getModule(); + llvm::Function *fn = llvm::Intrinsic::getDeclaration( + module, llvm::Intrinsic::log2, {$in->getType()}); + $res = builder.CreateCall(fn, {$in}); + }]; +} + +def LLVM_Prefetch : LLVM_ZeroResultOp<"intr.prefetch">, + Arguments<(ins LLVM_Type:$addr, LLVM_Type:$rw, + LLVM_Type:$hint, LLVM_Type:$cache)> { + let llvmBuilder = [{ + llvm::Module *module = builder.GetInsertBlock()->getModule(); + llvm::Function *fn = llvm::Intrinsic::getDeclaration( + module, llvm::Intrinsic::prefetch, $addr->getType()); + builder.CreateCall(fn, {$addr, $rw, $hint, $cache}); + }]; +} + +#endif // LLVMIR_OPS diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..afb6d4ab6272834e2f0baab9885c70be971b233b --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -0,0 +1,36 @@ +//===- NVVMDialect.h - MLIR NVVM IR dialect ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the NVVM IR dialect in MLIR, containing NVVM operations and +// NVVM specific extensions to the LLVM type system. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ +#define MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +namespace mlir { +namespace NVVM { + +///// Ops ///// +#define GET_OP_CLASSES +#include "mlir/Dialect/LLVMIR/NVVMOps.h.inc" + +class NVVMDialect : public Dialect { +public: + explicit NVVMDialect(MLIRContext *context); + + static StringRef getDialectNamespace() { return "nvvm"; } +}; + +} // namespace NVVM +} // namespace mlir + +#endif /* MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_ */ diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td new file mode 100644 index 0000000000000000000000000000000000000000..f35b7798149247ba98abd8968e596d8308755962 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -0,0 +1,137 @@ +//===-- NVVMOps.td - NVVM IR dialect op definition file ----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the NVVM IR operation definition file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef NVVMIR_OPS +#define NVVMIR_OPS + +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" + +//===----------------------------------------------------------------------===// +// NVVM dialect definitions +//===----------------------------------------------------------------------===// + +def NVVM_Dialect : Dialect { + let name = "nvvm"; + let cppNamespace = "NVVM"; +} + +//===----------------------------------------------------------------------===// +// NVVM op definitions +//===----------------------------------------------------------------------===// + +class NVVM_Op traits = []> : + LLVM_OpBase { +} + +//===----------------------------------------------------------------------===// +// NVVM special register op definitions +//===----------------------------------------------------------------------===// + +class NVVM_SpecialRegisterOp traits = []> : + NVVM_Op, + Results<(outs LLVM_Type:$res)>, Arguments<(ins)> { + string llvmBuilder = "$res = createIntrinsicCall(builder," + # "llvm::Intrinsic::nvvm_" # !subst(".","_", mnemonic) # ");"; + let parser = [{ return parseNVVMSpecialRegisterOp(parser, result); }]; + let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; +} + +//===----------------------------------------------------------------------===// +// Lane index and range +def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">; +def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">; + +//===----------------------------------------------------------------------===// +// Thread index and range +def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">; +def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">; +def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">; +def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">; +def NVVM_BlockDimYOp : 
NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">; +def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">; + +//===----------------------------------------------------------------------===// +// Block index and range +def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">; +def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">; +def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">; +def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">; +def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">; +def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">; + +//===----------------------------------------------------------------------===// +// NVVM synchronization op definitions +//===----------------------------------------------------------------------===// + +def NVVM_Barrier0Op : NVVM_Op<"barrier0"> { + string llvmBuilder = [{ + createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0); + }]; + let parser = [{ return success(); }]; + let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; +} + +def NVVM_ShflBflyOp : + NVVM_Op<"shfl.sync.bfly">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins LLVM_Type:$dst, + LLVM_Type:$val, + LLVM_Type:$offset, + LLVM_Type:$mask_and_clamp, + OptionalAttr:$return_value_and_is_valid)> { + string llvmBuilder = [{ + auto intId = getShflBflyIntrinsicId( + $_resultType, static_cast($return_value_and_is_valid)); + $res = createIntrinsicCall(builder, + intId, {$dst, $val, $offset, $mask_and_clamp}); + }]; + let parser = [{ return parseNVVMShflSyncBflyOp(parser, result); }]; + let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; + let verifier = [{ + if (!getAttrOfType("return_value_and_is_valid")) + return success(); + auto type = getType().cast(); + if (!type.isStructTy() || type.getStructNumElements() != 2 || + !type.getStructElementType(1).isIntegerTy( + /*Bitwidth=*/1)) + return emitError("expected 
return type !llvm<\"{ ?, i1 }\">"); + return success(); + }]; +} + +def NVVM_VoteBallotOp : + NVVM_Op<"vote.ballot.sync">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins LLVM_Type:$mask, LLVM_Type:$pred)> { + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, + llvm::Intrinsic::nvvm_vote_ballot_sync, {$mask, $pred}); + }]; + let parser = [{ return parseNVVMVoteBallotOp(parser, result); }]; + let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; +} + +def NVVM_MmaOp : + NVVM_Op<"mma.sync">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins Variadic:$args)> { + string llvmBuilder = [{ + $res = createIntrinsicCall( + builder, llvm::Intrinsic::nvvm_mma_m8n8k4_row_row_f32_f32, $args); + }]; + let parser = [{ return parseNVVMMmaOp(parser, result); }]; + let printer = [{ printNVVMMmaOp(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; +} + +#endif // NVVMIR_OPS diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..dab32d30e8f45dbd2e047b961821b5d3290c5b87 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h @@ -0,0 +1,45 @@ +//===- ROCDLDialect.h - MLIR ROCDL IR dialect -------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the ROCDL dialect in MLIR, containing ROCDL operations +// and ROCDL specific extensions to the LLVM type system. +// +// Unfortunately there does not exists a formal definition of ROCDL IR that be +// pointed to here. 
However the following links contain more information about +// ROCDL (ROCm-Device-Library) +// +// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCML.md +// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCKL.md +// https://llvm.org/docs/AMDGPUUsage.html +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_ +#define MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace ROCDL { + +///// Ops ///// +#define GET_OP_CLASSES +#include "mlir/Dialect/LLVMIR/ROCDLOps.h.inc" + +class ROCDLDialect : public Dialect { +public: + explicit ROCDLDialect(MLIRContext *context); + + static StringRef getDialectNamespace() { return "rocdl"; } +}; + +} // namespace ROCDL +} // namespace mlir + +#endif /* MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_ */ diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td new file mode 100644 index 0000000000000000000000000000000000000000..697ff9740a844b684e0d4e98b215aec9f9067ccb --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -0,0 +1,92 @@ +//===-- ROCDLOps.td - ROCDL IR dialect op definition file --*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the ROCDL IR operation definition file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef ROCDLIR_OPS +#define ROCDLIR_OPS + +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" + +//===----------------------------------------------------------------------===// +// ROCDL dialect definitions +//===----------------------------------------------------------------------===// + +def ROCDL_Dialect : Dialect { + let name = "rocdl"; + let cppNamespace = "ROCDL"; +} + +//===----------------------------------------------------------------------===// +// ROCDL op definitions +//===----------------------------------------------------------------------===// + +class ROCDL_Op traits = []> : + LLVM_OpBase { +} + +//===----------------------------------------------------------------------===// +// ROCDL special register op definitions +//===----------------------------------------------------------------------===// + +class ROCDL_SpecialRegisterOp traits = []> : + ROCDL_Op, + Results<(outs LLVM_Type:$res)>, Arguments<(ins)> { + string llvmBuilder = "$res = createIntrinsicCall(builder," + # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic) # ");"; + let parser = [{ return parseROCDLOp(parser, result); }]; + let printer = [{ printROCDLOp(p, this->getOperation()); }]; +} + +class ROCDL_DeviceFunctionOp traits = []> : + ROCDL_Op, + Results<(outs LLVM_Type:$res)>, Arguments<(ins)> { + string llvmBuilder = "$res = createDeviceFunctionCall(builder, \"" + # device_function # "\", " # parameter # ");"; + let parser = [{ return parseROCDLOp(parser, result); }]; + let printer = [{ printROCDLOp(p, this->getOperation()); }]; +} + +//===----------------------------------------------------------------------===// +// Thread index and Block index + +def ROCDL_ThreadIdXOp : ROCDL_SpecialRegisterOp<"workitem.id.x">; +def ROCDL_ThreadIdYOp : ROCDL_SpecialRegisterOp<"workitem.id.y">; +def ROCDL_ThreadIdZOp : ROCDL_SpecialRegisterOp<"workitem.id.z">; + +def ROCDL_BlockIdXOp : 
ROCDL_SpecialRegisterOp<"workgroup.id.x">; +def ROCDL_BlockIdYOp : ROCDL_SpecialRegisterOp<"workgroup.id.y">; +def ROCDL_BlockIdZOp : ROCDL_SpecialRegisterOp<"workgroup.id.z">; + +//===----------------------------------------------------------------------===// +// Thread range and Block range + +def ROCDL_BlockDimXOp : ROCDL_DeviceFunctionOp<"workgroup.dim.x", + "__ockl_get_local_size", 0>; + +def ROCDL_BlockDimYOp : ROCDL_DeviceFunctionOp<"workgroup.dim.y", + "__ockl_get_local_size", 1>; + +def ROCDL_BlockDimZOp : ROCDL_DeviceFunctionOp<"workgroup.dim.z", + "__ockl_get_local_size", 2>; + +def ROCDL_GridDimXOp : ROCDL_DeviceFunctionOp<"grid.dim.x", + "__ockl_get_global_size", 0>; + +def ROCDL_GridDimYOp : ROCDL_DeviceFunctionOp<"grid.dim.y", + "__ockl_get_global_size", 1>; + +def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z", + "__ockl_get_global_size", 2>; + + +#endif // ROCDLIR_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h b/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h new file mode 100644 index 0000000000000000000000000000000000000000..dd5034e823ceb9a17ecabc16bf231c9f9885e647 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h @@ -0,0 +1,134 @@ +//===- DependenceAnalysis.h - Dependence analysis on SSA views --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_ANALYSIS_DEPENDENCEANALYSIS_H_ +#define MLIR_DIALECT_LINALG_ANALYSIS_DEPENDENCEANALYSIS_H_ + +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +class FuncOp; + +namespace linalg { + +class LinalgOp; + +/// A very primitive alias analysis which just records for each view, either: +/// 1. 
The base buffer, or +/// 2. The block argument view +/// that it indexes into. +/// This does not perform inter-block or inter-procedural analysis and assumes +/// that different block argument views do not alias. +class Aliases { +public: + /// Returns true if v1 and v2 alias. + bool alias(Value v1, Value v2) { return find(v1) == find(v2); } + +private: + /// Returns the base buffer or block argument into which the view `v` aliases. + /// This lazily records the new aliases discovered while walking back the + /// use-def chain. + Value find(Value v); + + DenseMap aliases; +}; + +/// Data structure for holding a dependence graph that operates on LinalgOp and +/// views as SSA values. +class LinalgDependenceGraph { +public: + struct LinalgOpView { + Operation *op; + Value view; + }; + struct LinalgDependenceGraphElem { + // dependentOpView may be either: + // 1. src in the case of dependencesIntoGraphs. + // 2. dst in the case of dependencesFromDstGraphs. + LinalgOpView dependentOpView; + // View in the op that is used to index in the graph: + // 1. src in the case of dependencesFromDstGraphs. + // 2. dst in the case of dependencesIntoGraphs. + Value indexingView; + }; + using LinalgDependences = SmallVector; + using DependenceGraph = DenseMap; + using dependence_iterator = LinalgDependences::const_iterator; + using dependence_range = iterator_range; + + enum DependenceType { RAR = 0, RAW, WAR, WAW, NumTypes }; + + // Builds a linalg dependence graph for the ops of type LinalgOp under `f`. + static LinalgDependenceGraph buildDependenceGraph(Aliases &aliases, FuncOp f); + LinalgDependenceGraph(Aliases &aliases, ArrayRef ops); + + /// Returns the X such that op -> X is a dependence of type dt. + dependence_range getDependencesFrom(Operation *src, DependenceType dt) const; + dependence_range getDependencesFrom(LinalgOp src, DependenceType dt) const; + + /// Returns the X such that X -> op is a dependence of type dt. 
+ dependence_range getDependencesInto(Operation *dst, DependenceType dt) const; + dependence_range getDependencesInto(LinalgOp dst, DependenceType dt) const; + + /// Returns the operations that are interleaved between `srcLinalgOp` and + /// `dstLinalgOp` and that are involved in any RAW, WAR or WAW dependence + /// relation with `srcLinalgOp`, on any view. + /// Any such operation prevents reordering. + SmallVector + findCoveringDependences(LinalgOp srcLinalgOp, LinalgOp dstLinalgOp) const; + + /// Returns the operations that are interleaved between `srcLinalgOp` and + /// `dstLinalgOp` and that are involved in a RAR or RAW with `srcLinalgOp`. + /// Dependences are restricted to views aliasing `view`. + SmallVector findCoveringReads(LinalgOp srcLinalgOp, + LinalgOp dstLinalgOp, + Value view) const; + + /// Returns the operations that are interleaved between `srcLinalgOp` and + /// `dstLinalgOp` and that are involved in a WAR or WAW with `srcLinalgOp`. + /// Dependences are restricted to views aliasing `view`. + SmallVector findCoveringWrites(LinalgOp srcLinalgOp, + LinalgOp dstLinalgOp, + Value view) const; + +private: + // Keep dependences in both directions, this is not just a performance gain + // but it also reduces usage errors. + // Dependence information is stored as a map of: + // (source operation -> LinalgDependenceGraphElem) + DependenceGraph dependencesFromGraphs[DependenceType::NumTypes]; + // Reverse dependence information is stored as a map of: + // (destination operation -> LinalgDependenceGraphElem) + DependenceGraph dependencesIntoGraphs[DependenceType::NumTypes]; + + /// Analyses the aliasing views between `src` and `dst` and inserts the proper + /// dependences in the graph. + void addDependencesBetween(LinalgOp src, LinalgOp dst); + + // Adds an new dependence unit in the proper graph. 
+ // Uses std::pair to keep operations and view together and avoid usage errors + // related to src/dst and producer/consumer terminology in the context of + // dependences. + void addDependenceElem(DependenceType dt, LinalgOpView indexingOpView, + LinalgOpView dependentOpView); + + /// Implementation detail for findCoveringxxx. + SmallVector + findOperationsWithCoveringDependences(LinalgOp srcLinalgOp, + LinalgOp dstLinalgOp, Value view, + ArrayRef types) const; + + Aliases &aliases; + SmallVector linalgOps; + DenseMap linalgOpPositions; +}; +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_ANALYSIS_DEPENDENCEANALYSIS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f57627c321fb0c74b3e4a404e3c36bd435f64a7 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/include/mlir/Dialect/Linalg/EDSC/Builders.h b/mlir/include/mlir/Dialect/Linalg/EDSC/Builders.h new file mode 100644 index 0000000000000000000000000000000000000000..97fbede1cc78771eefdc8548e0e7d17935a9107b --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/EDSC/Builders.h @@ -0,0 +1,229 @@ +//===- Builders.h - MLIR Declarative Linalg Builders ------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides intuitive composable interfaces for building structured MLIR +// snippets in a declarative fashion. 
+// +//===----------------------------------------------------------------------===// +#ifndef MLIR_DIALECT_LINALG_EDSC_BUILDERS_H_ +#define MLIR_DIALECT_LINALG_EDSC_BUILDERS_H_ + +#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Intrinsics.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/Builders.h" + +namespace mlir { +class BlockArgument; + +namespace edsc { +enum class IterType { Parallel, Reduction }; + +inline StringRef toString(IterType t) { + switch (t) { + case IterType::Parallel: + return getParallelIteratorTypeName(); + case IterType::Reduction: + return getReductionIteratorTypeName(); + default: + llvm_unreachable("Unsupport IterType"); + } +} + +/// A StructuredIndexed represents a captured value that can be indexed and +/// passed to the `makeLinalgGenericOp`. It allows writing intuitive index +/// expressions such as: +/// +/// ``` +/// StructuredIndexed A(vA), B(vB), C(vC); +/// makeLinalgGenericOp({A({m, n}), B({k, n})}, {C({m, n})}, ... 
); +/// ``` +struct StructuredIndexed { + StructuredIndexed(Value v) : value(v) {} + StructuredIndexed operator()(ArrayRef indexings) { + return StructuredIndexed(value, indexings); + } + + operator Value() const /* implicit */ { return value; } + ArrayRef getExprs() { return exprs; } + +private: + StructuredIndexed(Value v, ArrayRef indexings) + : value(v), exprs(indexings.begin(), indexings.end()) { + assert(v->getType().isa() && "MemRefType expected"); + } + StructuredIndexed(ValueHandle v, ArrayRef indexings) + : StructuredIndexed(v.getValue(), indexings) {} + + Value value; + SmallVector exprs; +}; + +inline void defaultRegionBuilder(ArrayRef args) {} + +Operation *makeLinalgGenericOp( + ArrayRef iteratorTypes, ArrayRef inputs, + ArrayRef outputs, + function_ref)> regionBuilder = + defaultRegionBuilder, + ArrayRef otherValues = {}, ArrayRef otherAttributes = {}); + +namespace ops { +using edsc::StructuredIndexed; +using edsc::ValueHandle; +using edsc::intrinsics::linalg_yield; + +//===----------------------------------------------------------------------===// +// EDSC builders for linalg generic operations. +//===----------------------------------------------------------------------===// + +/// Build the body of a region to compute a multiply-accumulate, under the +/// current ScopedContext, at the current insert point. +void macRegionBuilder(ArrayRef args); + +/// TODO(ntv): In the future we should tie these implementations to something in +/// Tablegen that generates the proper interfaces and the proper sugared named +/// ops. 
+ +/// Build a linalg.pointwise, under the current ScopedContext, at the current +/// insert point, that computes: +/// ``` +/// (i0, ..., in) = (par, ..., par) +/// | +/// | O...(some_subset...(i0, ..., in)) = +/// | some_pointwise_func...(I...(some_other_subset...(i0, ..., in))) +/// ``` +/// +/// This is a very generic entry point that can be configured in many ways to +/// build a perfect loop nest of parallel loops with arbitrarily complex +/// innermost loop code and whatever (explicit) broadcast semantics. +/// +/// This can be used with both out-of-place and in-place semantics. +/// The client is responsible for ensuring the region operations are compatible +/// with in-place semantics and parallelism. + +/// Unary pointwise operation (with broadcast) entry point. +using UnaryPointwiseOpBuilder = function_ref; +Operation *linalg_pointwise(UnaryPointwiseOpBuilder unaryOp, + StructuredIndexed I, StructuredIndexed O); + +/// Build a linalg.pointwise with all `parallel` iterators and a region that +/// computes `O = tanh(I)`. The client is responsible for specifying the proper +/// indexings when creating the StructuredIndexed. +Operation *linalg_pointwise_tanh(StructuredIndexed I, StructuredIndexed O); + +/// Binary pointwise operation (with broadcast) entry point. +using BinaryPointwiseOpBuilder = function_ref; +Operation *linalg_pointwise(BinaryPointwiseOpBuilder binaryOp, + StructuredIndexed I1, StructuredIndexed I2, + StructuredIndexed O); + +/// Build a linalg.pointwise with all `parallel` iterators and a region that +/// computes `O = I1 + I2`. The client is responsible for specifying the proper +/// indexings when creating the StructuredIndexed. +Operation *linalg_pointwise_add(StructuredIndexed I1, StructuredIndexed I2, + StructuredIndexed O); + +/// Build a linalg.pointwise with all `parallel` iterators and a region that +/// computes `O = max(I!, I2)`. 
The client is responsible for specifying the +/// proper indexings when creating the StructuredIndexed. +Operation *linalg_pointwise_max(StructuredIndexed I1, StructuredIndexed I2, + StructuredIndexed O); + +// TODO(ntv): Implement more useful pointwise operations on a per-need basis. + +/// Build a linalg.generic, under the current ScopedContext, at the current +/// insert point, that computes: +/// ``` +/// (m, n, k) = (par, par, seq) +/// | +/// | C(m, n) += A(m, k) * B(k, n) +/// ``` +Operation *linalg_matmul(ValueHandle vA, ValueHandle vB, ValueHandle vC); + +template Operation *linalg_matmul(Container values) { + assert(values.size() == 3 && "Expected exactly 3 values"); + return linalg_matmul(values[0], values[1], values[2]); +} + +/// Build a linalg.generic, under the current ScopedContext, at the current +/// insert point, that computes: +/// ``` +/// (batch, f, [h, w, ...], [kh, kw, ...], c) = +/// | (par, par, [par, par, ...], [red, red, ...], red) +/// | +/// | O(batch, [h, w, ...], f) += +/// | I(batch, +/// | [ +/// | stride[0] * h + dilations[0] * kh, +/// | stride[1] * w + dilations[1] * kw, ... +/// ], +/// | c) +/// | * +/// | W([kh, kw, ...], c, f) +/// ``` +/// If `dilations` or `strides` are left empty, the default value of `1` is used +/// along each relevant dimension. +/// +/// For now `...` must be empty (i.e. only 2-D convolutions are supported). +/// +// TODO(ntv) Extend convolution rank with some template magic. 
+Operation *linalg_conv_nhwc(ValueHandle vI, ValueHandle vW, ValueHandle vO, + ArrayRef strides = {}, + ArrayRef dilations = {}); + +template +Operation *linalg_conv_nhwc(Container values, ArrayRef strides = {}, + ArrayRef dilations = {}) { + assert(values.size() == 3 && "Expected exactly 3 values"); + return linalg_conv_nhwc(values[0], values[1], values[2], strides, dilations); +} + +/// Build a linalg.generic, under the current ScopedContext, at the current +/// insert point, that computes: +/// ``` +/// (batch, dm, c, [h, w, ...], [kh, kw, ...]) = +/// | (par, par, par, [par, par, ...], [red, red, ...]) +/// | +/// | O(batch, [h, w, ...], c * depth_multiplier) += +/// | I(batch, +/// | [ +/// | stride[0] * h + dilations[0] * kh, +/// | stride[1] * w + dilations[1] * kw, ... +/// ], +/// | c) +/// | * +/// | W([kh, kw, ...], c, depth_multiplier) +/// ``` +/// If `dilations` or `strides` are left empty, the default value of `1` is used +/// along each relevant dimension. +/// +/// For now `...` must be empty (i.e. only 2-D convolutions are supported). +/// +// TODO(ntv) Extend convolution rank with some template magic. 
+Operation *linalg_dilated_conv_nhwc(ValueHandle vI, ValueHandle vW, + ValueHandle vO, int depth_multiplier = 1, + ArrayRef strides = {}, + ArrayRef dilations = {}); + +template +Operation *linalg_dilated_conv_nhwc(Container values, int depth_multiplier, + ArrayRef strides = {}, + ArrayRef dilations = {}) { + assert(values.size() == 3 && "Expected exactly 3 values"); + return linalg_dilated_conv_nhwc(values[0], values[1], values[2], + depth_multiplier, strides, dilations); +} + +} // namespace ops +} // namespace edsc +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_EDSC_BUILDERS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h b/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..b04c11f22bb9f1e919aec58e028ccb86d7cad93a --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/EDSC/Intrinsics.h @@ -0,0 +1,26 @@ +//===- Intrinsics.h - MLIR EDSC Intrinsics for Linalg -----------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_DIALECT_LINALG_EDSC_INTRINSICS_H_ +#define MLIR_DIALECT_LINALG_EDSC_INTRINSICS_H_ + +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Intrinsics.h" + +namespace mlir { +namespace edsc { +namespace intrinsics { + +using linalg_fill = OperationBuilder; +using linalg_yield = OperationBuilder; + +} // namespace intrinsics +} // namespace edsc +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_EDSC_INTRINSICS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..269729bc644528e4573c3f0e8338570d55f8bd5c --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -0,0 +1,8 @@ +add_mlir_dialect(LinalgOps LinalgDoc) +set(LLVM_TARGET_DEFINITIONS LinalgStructuredOps.td) +mlir_tablegen(LinalgStructuredOps.h.inc -gen-op-decls) +mlir_tablegen(LinalgStructuredOps.cpp.inc -gen-op-defs) +mlir_tablegen(LinalgStructuredOpsInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(LinalgStructuredOpsInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(MLIRLinalgStructuredOpsIncGen) + diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td new file mode 100644 index 0000000000000000000000000000000000000000..c1adc8b4d05c908ae7eb351067635a233ef1f81b --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td @@ -0,0 +1,111 @@ +//===- LinalgBase.td - Linalg dialect base support ---------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the definition file for base linear algebra support. +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_BASE +#define LINALG_BASE + +include "mlir/IR/OpBase.td" + +def Linalg_Dialect : Dialect { + let name = "linalg"; + let description = [{ + The `linalg` dialect groups together a set of types, operations and + transformations that are useful to implement a structured abstraction where + ops can lower to scalar load/store and operations or to more general library + calls. + + The `linalg` dialect manipulates the following types and operations: + + ### Core data types and special ops. + + The following abstractions are used by the `linalg` dialect: + + #### Views + The current implementation uses the strided memref abstraction. In the + future other abstractions than strided memref will be used. + + #### `!linalg.range` + This data type is currently just a triple (`min`,`max`, `step`) that does + not pass function boundaries. + + #### `linalg.yield` + This op is used as a terminator within the appropriate `linalg` regions. + + In the future, richer `view` and `range` representations are expected, in + particular to represent sparse traversals. + + ### Metadata Ops + A set of ops that manipulate metadata but do not move memory. These ops take + `view` operands + extra attributes and return new `view`s. The returned + `view`s generally alias the operand `view`. At the moment the existing ops + are: + + * `std.view`, + * `std.subview`, + * `linalg.range`, + * `linalg.slice`, + * `linalg.transpose`. + + Future ops are added on a per-need basis but should include: + + * `linalg.reshape`, + * `linalg.tile`, + * `linalg.intersection`, + * `linalg.convex_union`, + * `linalg.difference` (would need to work on a list of views). 
+ + ### Payload Ops + A set of payload carrying operations that implement the [structured ops]( + https://docs.google.com/presentation/d/1P-j1GrH6Q5gLBjao0afQ-GfvcAeF-QU4GXXeSy0eJ9I/edit#slide=id.p + ) + abstraction on buffers. `linalg` has `2` generic operations `linalg.generic` + and `linalg.indexed_generic` for expressing custom operations. This is + subject to further evolution as transformations and analyses continue to be + developed. + + Additionally, `linalg` provides some common named operations: + + * `linalg.copy`, + * `linalg.fill`, + * `linalg.dot`, + * `linalg.matmul`, + * `linalg.conv`. + + Future ops are added on a per-need basis but should include: + + * `linalg.pad`. + + In an ideal world, all the named ops would be automatically generated from + a description in terms of only the `2` generic ops. Unfortunately we do not + have such support yet (contributions are most welcome). + + ### Convention for external library interop + The `linalg` dialect adopts a convention that is similar to `BLAS` when + offloading operations to fast library implementations: pass a non-owning + pointer to input and output data with additional metadata. This convention + is also found in libraries such as `MKL`, `OpenBLAS`, `BLIS`, `cuBLAS`, + `cuDNN`, etc.. and more generally at interface points across language + boundaries (e.g. C++ / Python). + + Generally, `linalg` passes non-owning pointers to strided memref data + structures to precompiled library calls linked externally. The name `view` + is used interchangeably in `linalg` to signify strided memref discussed at + length in the [strided memref RFC]( + https://groups.google.com/a/tensorflow.org/g/mlir/c/MaL8m2nXuio/m/a_v07o9yBwAJ). + }]; +} + +// Whether a type is a RangeType. 
+def LinalgIsRangeTypePred : CPred<"$_self.isa()">; +def Range : Type; + +#endif // LINALG_BASE diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgDoc.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgDoc.td new file mode 100644 index 0000000000000000000000000000000000000000..819d02d396d4598f42a513eb335e7484b7253f43 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgDoc.td @@ -0,0 +1,23 @@ +//===- LinalgDoc.td - Linalg documentation -----------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This documentation files exists to circumvent limitations on mixing different +// .td files in cases one does not want to have all ops belong to the same +// logical unit. This file should only include other .td files only and be used +// for the purpose of generating documentation. +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_DOC +#define LINALG_DOC + +include "mlir/Dialect/Linalg/IR/LinalgBase.td" +include "mlir/Dialect/Linalg/IR/LinalgOps.td" +include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.td" + +#endif // LINALG_DOC diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgLibraryOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgLibraryOps.td new file mode 100644 index 0000000000000000000000000000000000000000..6fdb8a644af7dbdb9f209f7e17491c26daff80f0 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgLibraryOps.td @@ -0,0 +1,616 @@ +//===- LinalgLibraryOps.td - Linalg dialect library ops -*- tablegen ----*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the operation definition file for linear algebra operations that +// correspond to underlying library calls (e.g. BLAS). +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_LIBRARY_OPS +#define LINALG_LIBRARY_OPS + +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Linalg/IR/LinalgBase.td" + +// The Linalg `NInputs` trait provides the API for ops that are known +// to have a specified number of inputs, all passed as operands. +// See Linalg/LinalgTraits.h for implementation details an usage. +class NInputs : + NativeOpTrait<"linalg::NInputs<" # !cast(args_in) # ">::Impl"> {} + +// The Linalg `NOutputs` trait provides the API for ops that are known +// to have a specified number of outputs, all passed as operands. +// See Linalg/LinalgTraits.h for implementation details an usage. +class NOutputs : + NativeOpTrait<"linalg::NOutputs<" # !cast(args_out) # ">::Impl"> {} + +def ViewTraits : NativeOpTrait<"linalg::ViewTraits">; + +// The linalg 'LinalgLibraryInterface' provides access to the 'LinalgOp' +// interface. 
+def LinalgLibraryInterface : OpInterface<"LinalgOp"> { + let methods = [ + InterfaceMethod< + "Query the number of inputs from the current operation.", + "unsigned", "getNumInputs" + >, + InterfaceMethod< + "Query the number of outputs from the current operation.", + "unsigned", "getNumOutputs" + >, + InterfaceMethod< + "Query the number of inputs and outputs from the current operation.", + "unsigned", "getNumInputsAndOutputs" + >, + InterfaceMethod< + "Query the input operands from the current operation.", + "Operation::operand_range", "getInputs" + >, + InterfaceMethod< + "Query the output operands from the current operation.", + "Operation::operand_range", "getOutputs" + >, + InterfaceMethod< + "Query the input and output operands from the current operation.", + "Operation::operand_range", "getInputsAndOutputs" + >, + InterfaceMethod< + "Query the iterator types attribute within the current operation.", + "ArrayAttr", "iterator_types" + >, + InterfaceMethod< + "Query the indexing maps attribute within the current operation.", + "ArrayAttr", "indexing_maps" + >, + InterfaceMethod< + "Query the number of parallel loops within the current operation.", + "unsigned", "getNumParallelLoops" + >, + InterfaceMethod< + "Query the number of reduction loops within the current operation.", + "unsigned", "getNumReductionLoops" + >, + InterfaceMethod< + "Query the number of window loops within the current operation.", + "unsigned", "getNumWindowLoops" + >, + InterfaceMethod< + "Query the number of loops within the current operation.", + "unsigned", "getNumLoops">, + InterfaceMethod<"Query the input view at the given index.", + "Value ", "getInput", (ins "unsigned":$i) + >, + InterfaceMethod<"Query the output view at the given index.", + "Value ", "getOutput", (ins "unsigned":$i) + >, + InterfaceMethod<[{ + Query the index of the given input value, or `None` if the value is not + an input. 
+ }], + "Optional", "getIndexOfInput", (ins "Value ":$view) + >, + InterfaceMethod<[{ + Query the index of the given view value, or `None` if the value is not + an view. + }], + "Optional", "getIndexOfOutput", (ins "Value ":$view) + >, + InterfaceMethod<[{ + Query the type of the input view at the given index. + }], "MemRefType", "getInputViewType", (ins "unsigned":$i)>, + InterfaceMethod<[{ + Query the type of the output view at the given index. + }], "MemRefType", "getOutputViewType", (ins "unsigned":$i)>, + + StaticInterfaceMethod<[{ + Create an operation of the current type with the given location, + operands, and attributes. + }], + "Operation *", "create", + (ins "OpBuilder &":$builder, "Location":$loc, + "ValueRange":$operands, + "ArrayRef":$attributes), [{ + return builder.create(loc, ArrayRef{}, operands, + attributes); + }] + >, + + /// Clone an operation with the given location and operands. This is used to + /// abstract away the optional underlying region creation. + InterfaceMethod<[{ + Clone the current operation with the given location and operands. This + is used to abstract away the optional underlying region creation. + }], + "Operation *", "clone", + (ins "OpBuilder &":$b, "Location":$loc, "ValueRange":$operands), [{ + BlockAndValueMapping map; + unsigned numRegions = op.getOperation()->getNumRegions(); + Operation *res = create(b, loc, operands, op.getAttrs()); + assert(res->getNumRegions() == numRegions && "inconsistent # regions"); + for (unsigned ridx = 0; ridx < numRegions; ++ridx) + op.getOperation()->getRegion(ridx).cloneInto( + &res->getRegion(ridx), map); + return res; + }] + > + ]; +} + +// Base Tablegen class for Linalg ops. +// Linalg ops that correspond to library calls operate on linalg::View as their +// first operands. These may be optionally followed by non-view operands +// depending on the specific Linalg op. 
+class LinalgLibraryBase_Op props> + : Op { + let parser = [{ return parseLinalgLibraryOp(parser, result); }]; + let printer = [{ printLinalgLibraryOp(p, *this); }]; +} + +class LinalgLibrary_Op props> + : LinalgLibraryBase_Op { + code libraryCallName = [{ + std::string getLibraryCallName() { + return generateLibraryCallName(getOperation()); + } + }]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Concrete Linalg ops. +//////////////////////////////////////////////////////////////////////////////// +def CopyOp : LinalgLibrary_Op<"copy", [NInputs<1>, NOutputs<1>]> { + let description = [{ + Copies the data in the input view into the output view. + + Usage: + ```mlir + linalg.copy(%arg0, %arg1) : memref, + memref + ``` + + One possible lowering to loop form is: + ```mlir + %0 = linalg.dim %arg0, 0 : index + loop.for %i0 = %c0 to %0 step %c1 { + %1 = linalg.load %arg0[%i0] : memref + linalg.store %1, %arg1[%i0] : memref + } + ``` + + Optionally, can take `input_permutation` and `output_permutation` attributes + to reorder the dimensions of the input and output views. + + Usage: + ```mlir + linalg.copy(%arg0, %arg1) {inputPermutation : (i, j, k) -> (i, k, j), + outputPermutation : (i, j, k) -> (k, j, i)} : + memref, + memref + ``` + + One possible lowering to loop form is: + ```mlir + %0 = linalg.dim %arg0, 0 + %1 = linalg.dim %arg0, 1 + %2 = linalg.dim %arg0, 2 + loop.for %i0 = %c0 to %{{.*}} step %c1 { + loop.for %i1 = %c0 to %{{.*}} step %c1 { + loop.for %i2 = %c0 to %{{.*}} step %c1 { + %3 = linalg.load %arg0[%i0, %i2, %i1] : + memref + linalg.store %3, %arg1[%i2, %i1, %i0] : + memref + ``` + + The views are expected to be compatible for correctness but this is not + enforced at the moment. 
+ }]; + let arguments = (ins + AnyStridedMemRef:$input, + AnyStridedMemRef:$output, + OptionalAttr:$inputPermutation, + OptionalAttr:$outputPermutation); + // TODO(ntv) this should go away once the usage of OptionalAttr triggers + // emission of builders with default arguments left unspecified. + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value input, Value output", [{ + return build( + builder, result, input, output, AffineMapAttr(), AffineMapAttr()); + }]>]; + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + unsigned nPar = input()->getType().cast().getRank(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def FillOp : LinalgLibrary_Op<"fill", [NInputs<0>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRef:$output, + AnyTypeOf<[AnyFloat, AnyInteger, AnyVector]>:$value); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + unsigned nPar = output()->getType().cast().getRank(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def DotOp : LinalgLibrary_Op<"dot", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<0>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + return ArrayAttr::get( + StringAttr::get(getReductionIteratorTypeName(), ctx), ctx); + } + }]; +} + +def MatvecOp : LinalgLibrary_Op<"matvec", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<2>, + 
AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<1>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + Attribute iters[2]{ + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getReductionIteratorTypeName(), ctx)}; + return ArrayAttr::get(iters, ctx); + } + }]; +} + +def MatmulOp : LinalgLibrary_Op<"matmul", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<2>, + AnyStridedMemRefOfRank<2>, + AnyStridedMemRefOfRank<2>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + Attribute iters[3]{ + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getReductionIteratorTypeName(), ctx)}; + return ArrayAttr::get(iters, ctx); + } + }]; +} + +def ConvOp : LinalgLibrary_Op<"conv", [NInputs<2>, NOutputs<1>]> { + let description = [{ + Generic n-D convolution as described in the TF documentation: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/nn/convolution + + ``` + output[b, x[0], ..., x[N-1], k] = + sum_{z[0], ..., z[N-1], q} + filter[z[0], ..., z[N-1], q, k] * + padded_input[b, + x[0] * strides[0] + dilation_rate[0] * z[0], + ..., + x[N-1] * strides[N-1] + dilation_rate[N-1] * z[N-1], + q] + ``` + }]; + + // TODO(ntv) padding. + // Following the TF source of truth above, strides and dilations are integer + // attributes of the same rank as the number of window dimensions. + let arguments = (ins AnyStridedMemRef:$filter, AnyStridedMemRef:$input, + AnyStridedMemRef:$output, + OptionalAttr:$strides, + OptionalAttr:$dilations); + let extraClassDeclaration = libraryCallName # [{ + // TODO(ntv) extend to support more than 1 dimensions and potentially + // grouping too. 
+ unsigned getNumBatchDimensions() { return 1; } + unsigned getNumInputFeatureDimensions() { return 1; } + unsigned getNumOutputFeatureDimensions() { return 1; } + + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + // Outer parallel loops are always the number of output dimensions; i.e. + // [ b, xs, q] in the TF notation above. + unsigned nPar = getOutputViewType(0).getRank(); + unsigned nRed = getNumInputFeatureDimensions(); + // Window loops are a special kind of reduction that is never tiled or + // parallelized across; i.e. [zs] in the TF notation above whose number + // match `xs` (i.e. 1 window loop per "image" dimension). + // This may evolve in the future. + unsigned nWin = + nPar - getNumBatchDimensions() - getNumInputFeatureDimensions(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + iters.reserve(nPar + nRed + nWin); + iters.append(nRed, StringAttr::get(getReductionIteratorTypeName(), ctx)); + iters.append(nWin, StringAttr::get(getWindowIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + + int64_t getStride(unsigned i) { + assert(i < getNumWindowLoops()); + if (!strides().hasValue()) return 1; + return strides()->getValue()[i] + .cast().getValue().getSExtValue(); + } + + int64_t getDilation(unsigned i) { + assert(i < getNumWindowLoops()); + if (!dilations().hasValue()) return 1; + return dilations()->getValue()[i] + .cast().getValue().getSExtValue(); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +class GenericOpBase : LinalgLibraryBase_Op { + let arguments = (ins Variadic:$views, + I64Attr:$args_in, + I64Attr:$args_out, + AffineMapArrayAttr:$indexing_maps, + ArrayAttr:$iterator_types, + OptionalAttr:$doc, + OptionalAttr:$fun, + OptionalAttr:$library_call); + let regions = (region AnyRegion:$region); + let extraClassDeclaration = [{ + SmallVector linalgTraitAttrNames() { + return SmallVector{ + getArgsInAttrName(), 
getArgsOutAttrName(), getDocAttrName(), + getFunAttrName(), getIndexingMapsAttrName(), getLibraryCallAttrName(), + getIteratorTypesAttrName() + }; + } + unsigned getNumInputs() { return args_in().getSExtValue(); } + unsigned getNumOutputs() { return args_out().getSExtValue(); } + FuncOp getFunction() { + auto moduleOp = getParentOfType(); + return fun().hasValue() ? + moduleOp.lookupSymbol(fun().getValue()) : FuncOp(); + } + StringRef getLibraryCallName() { + return library_call().hasValue() ? library_call().getValue() : ""; + } + AffineMap getIndexingMap(unsigned i) { + assert(i < getNumInputsAndOutputs()); + return indexing_maps().getValue()[i].cast().getValue(); + } + AffineMap getInputIndexingMap(unsigned i) { + assert(i < getNumInputs()); + return indexing_maps().getValue()[i].cast().getValue(); + } + AffineMap getOutputIndexingMap(unsigned i) { + assert(i < getNumOutputs()); + return indexing_maps().getValue()[i + getNumInputs()] + .cast().getValue(); + } + }]; + let printer = [{ return ::print(p, *this); }]; + let parser = [{ return ::parseGenericOp(parser, result); }]; +} + +def GenericOp : GenericOpBase<"generic"> { + let description = [{ + Generic Linalg op form where the key properties of the computation are + specified as attributes. In pretty form, a linalg.generic op is written as: + + ```mlir + linalg.generic #trait_attribute %A, %B, %C {other-attributes} : + memref, + memref, + memref + ``` + + Where #trait_attributes is an alias of a dictionary attribute containing: + - args_in: an I64Attr representing the number of input (readonly) views + - args_out: an I64Attr representing the number of output (readwrite) views + - doc [optional]: a documentation string + - fun: a FlatSymbolRefAttr that must resolve to an existing function + symbol. 
To support inplace updates in a generic fashion, the signature + of the function must be: + ``` + fun([input views element types], [output views element types]) + -> ([output views element types]) + ``` + - indexing_maps: a list of AffineMapAttr, one AffineMapAttr per each input + and output view. Such AffineMapAttr specifies the mapping between the + loops and the indexing within each view. + - library_call [optional]: a StringAttr containing the name of an + external library function that the linalg.generic operation maps to. + The external library is assumed to be dynamically linked and no strong + compile-time guarantees are provided. In the absence of such a library + call, linalg.generic will always lower to loops. + - iterator_types: an ArrayAttr specifying the type of the enclosing loops. + Each element of the list represents and iterator of one of the following + types: + parallel, reduction, window + + Example: + Defining a #matmul_trait attribute in MLIR can be done as follows: + ```mlir + func @fma(%a: f32, %b: f32, %c: f32) -> f32 { + %d = mulf %a, %b: f32 + %e = addf %c, %d: f32 + return %e: f32 + } + #matmul_accesses = [ + (m, n, k) -> (m, k), + (m, n, k) -> (k, n), + (m, n, k) -> (m, n) + ] + #matmul_trait = { + doc = "C(m, n) += A(m, k) * B(k, n)", + fun = @fma, + indexing_maps = #matmul_accesses, + library_call = "linalg_matmul", + n_views = [2, 1], + iterator_types = ["parallel", "parallel", "reduction"] + } + ``` + + And can be reused in multiple places as: + ```mlir + linalg.generic #matmul_trait %A, %B, %C [other-attributes] : + memref, + memref, + memref + ``` + + This may lower to either: + ```mlir + call @linalg_matmul(%A, %B, %C) : + (memref, + memref, + memref) + -> () + ``` + + or IR resembling: + ```mlir + loop.for %m = %c0 to %M step %c1 { + loop.for %n = %c0 to %N step %c1 { + loop.for %k = %c0 to %K step %c1 { + %a = linalg.load %A[%m, %k] : memref + %b = linalg.load %B[%k, %n] : memref + %c = linalg.load %C[%m, %n] : memref + %d = 
call @func_of_elements(%a, %b, %c) + : (f32, f32, f32) -> (f32) + linalg.store %d, %C[%m, %n] : memref + } + } + } + ``` + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def IndexedGenericOp : GenericOpBase<"indexed_generic"> { + let description = [{ + Indexed Generic Linalg op form where the key properties of the computation + are specified as attributes. In pretty form, a linalg.indexed_generic op is + written as: + + ```mlir + linalg.indexed_generic #trait_attribute %A, %B, %C {other-attributes} : + memref, + memref, + memref + ``` + + Where #trait_attributes is an alias of a dictionary attribute containing: + - args_in: an I64Attr representing the number of input (readonly) views + - args_out: an I64Attr representing the number of output (readwrite) views + - doc [optional]: a documentation string + - fun: a FlatSymbolRefAttr that must resolve to an existing function + symbol. To support inplace updates in a generic fashion, the signature + of the function must be: + ``` + fun([index types of induction variables], [input views element types], + [output views element types]) -> ([output views element types]) + ``` + - indexing_maps: a list of AffineMapAttr, one AffineMapAttr per each input + and output view. Such AffineMapAttr specifies the mapping between the + loops and the indexing within each view. + - library_call [optional]: a StringAttr containing the name of an + external library function that the linalg.indexed_generic operation + maps to. The external library is assumed to be dynamically linked and + no strong compile-time guarantees are provided. In the absence of such + a library call, linalg.indexed_generic will always lower to loops. 
+ - iterator_types: an ArrayAttr they type of the enclosing loops; Each + element of the list represents and iterator of one of the following + types: + parallel, reduction, window + + Example: + Defining a #matmul_trait attribute in MLIR can be done as follows: + ```mlir + func @fma(%i: index, %j: index, %k: index, %a: f32, %b: f32, %c: f32) + -> f32 + { + %d = mulf %a, %b: f32 + %e = addf %c, %d: f32 + return %e: f32 + } + #matmul_accesses = [ + (m, n, k) -> (m, k), + (m, n, k) -> (k, n), + (m, n, k) -> (m, n) + ] + #matmul_trait = { + doc = "C(m, n) += A(m, k) * B(k, n)", + fun = @fma, + indexing_maps = #matmul_accesses, + library_call = "linalg_matmul", + n_views = [2, 1], + iterator_types = ["parallel", "parallel", "reduction"] + } + ``` + + And can be reused in multiple places as: + ```mlir + linalg.indexed_generic #matmul_trait %A, %B, %C [other-attributes] : + memref, + memref, + memref + ``` + + This may lower to either: + ```mlir + call @linalg_matmul(%A, %B, %C) : + (memref, + memref, + memref) + -> () + ``` + + or IR resembling: + ```mlir + loop.for %m = %c0 to %M step %c1 { + loop.for %n = %c0 to %N step %c1 { + loop.for %k = %c0 to %K step %c1 { + %a = linalg.load %A[%m, %k] : memref + %b = linalg.load %B[%k, %n] : memref + %c = linalg.load %C[%m, %n] : memref + %d = call @func_of_elements_and_indices(%m, %n, %k, %a, %b, %c) + : (index, index, index, f32, f32, f32) -> (f32) + linalg.store %d, %C[%m, %n] : memref + } + } + } + ``` + }]; + let verifier = [{ return ::verify(*this); }]; +} + +#endif // LINALG_LIBRARY_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h new file mode 100644 index 0000000000000000000000000000000000000000..3249edb48e020f0aec277e3ad8ee766f972c0661 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h @@ -0,0 +1,83 @@ +//===- LinalgOps.h - Linalg Operations --------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_LINALGOPS_H_ +#define MLIR_DIALECT_LINALG_LINALGOPS_H_ + +#include "mlir/Dialect/Linalg/IR/LinalgTraits.h" +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/Support/LLVM.h" + +namespace mlir { +namespace linalg { + +/// Returns the name mangled library call name to disambiguate between different +/// overloads at the C level. The name mangling scheme is basic and uses MLIR +/// type names: +/// 1. form a string which is the concatenation of the linalg op name with all +/// the operand type names, separate by underscores; +/// 2. drop the `linalg.` prefix, and the `<`, `>`, `?` symbols from the type. +/// Assumes `op` is a LinalgOp. +/// +/// Examples: +/// +/// 1. linalg.fill(%A, %f) : memref, f32 +/// name mangles into `linalg_fill_viewf32_f32_impl` +/// +/// 2. linalg.dot(%A, %B, %C) : +/// memref, +/// memref, memref +/// name mangles into `linalg_dot_viewxf32_viewxf32_viewf32_impl` +/// +/// 3. linalg.matmul(...) : +/// memref, +/// memref, +/// memref +/// name mangles into `linalg_matmul_viewxxf32_viewxxf32_viewxxf32_impl` +std::string generateLibraryCallName(Operation *op); + +/// Returns the list of maps that map loops to operands of a Linalg op. +/// The i-th affine map identifies loop indices to subscripts that are used when +/// accessing the i-th operand. 
+/// For instance, a matmul that can be written in index notation as: +/// `A(i, k) * B(k, j) -> C(i, j)` will have the following, ordered, list of +/// affine maps: +/// +/// ```mlir +/// ( +/// (i, j, k) -> (i, k), +/// (i, j, k) -> (k, j), +/// (i, j, k) -> (i, j) +/// ) +/// ``` +/// +/// Only permutation maps are currently supported. +SmallVector loopToOperandRangesMaps(Operation *op); + +#include "mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterfaces.h.inc" + +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgOps.h.inc" + +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc" + +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_LINALGOPS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td new file mode 100644 index 0000000000000000000000000000000000000000..0445968ee809dec2bdc1f95d69b84a6d772d330c --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -0,0 +1,181 @@ +//===- LinalgOps.td - Linalg dialect ops -------------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the operation definition file for linear algebra operations. +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_OPS +#define LINALG_OPS + +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Linalg/IR/LinalgBase.td" + +// Base class for Linalg dialect ops that do not correspond to library calls. 
+class Linalg_Op traits = []> : + Op { + // For every linalg op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions. + let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +def Linalg_RangeOp : + Linalg_Op<"range", [NoSideEffect]>, + Arguments<(ins Index:$min, Index:$max, Index:$step)>, + Results<(outs Range)> { + let summary = "Create a `range` type value, used to create `view`s"; + let description = [{ + The `linalg.range` op creates a `!linalg.range` from 3 values of type + `index` that represent the min, max and step values of the `range`. This + type does not pass function boundaries at the moment. + + Example: + + ```mlir + %3 = linalg.range %0:%1:%2 : !linalg.range + ```` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value min, Value max, " + "Value step", + [{ + auto rangeType = RangeType::get(builder->getContext()); + build(builder, result, rangeType, min, max, step); + }]>]; + + // Fully specified by traits. + let verifier = ?; +} + +def Linalg_SliceOp : Linalg_Op<"slice", [NoSideEffect]>, + Arguments<(ins AnyStridedMemRef:$view, Variadic>:$indexings)>, + Results<(outs AnyStridedMemRef)> { + let summary = "Produce a rank-reduced `subview` of a base `view`."; + let description = [{ + The `linalg.slice` op allows defining a subregion of a smaller rank than the + operand `view` within the underlying buffer. + + A `linalg.slice` op takes a view and a variadic number of indexings and + produces a `view` of the same elemental type. An indexing is either: + 1. a `linalg.range`, in which case it does not reduce the rank of the + parent `view` along the corresponding dimension. + 2. 
an `index`, in which case it reduces the rank of the parent view by + one. + + If an indexing extends past the size of the `view`, this is undefined + behavior. Ideally the `linalg.slice` operation would automatically truncate + it to be within bounds but there are tradeoffs involved now that `std.view` + is a standard op. + + Examples: + + 1. rank-preserving `slice`: + + ```mlir + %4 = linalg.slice %0[%1, %2] : memref, + !linalg.range, !linalg.range, memref + ``` + + 2. rank-reducing `slice` (from 2-D to 1-D): + + ```mlir + %4 = linalg.slice %0[%1, %2] : memref, + index, !linalg.range, memref + ``` + + 3. rank-reducing `slice` (from 2-D to 0-D): + + ```mlir + %4 = linalg.slice %0[%1, %2] : memref, + index, index, memref + ``` + }]; + + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value base, " + "ValueRange indexings">]; + + let extraClassDeclaration = [{ + enum { FirstIndexingOperand = 1 }; + unsigned getRank() { return getViewType().getRank(); } + Type getElementType() { return getViewType().getElementType(); } + MemRefType getViewType() { return getType().cast(); } + unsigned getBaseViewRank() { return getBaseViewType().getRank(); } + MemRefType getBaseViewType() { return view()->getType().cast(); } + + // Get the underlying indexing at a given rank. + Value indexing(unsigned rank) { return *(indexings().begin() + rank); } + + // Get the subset of indexings that are of RangeType. 
+ SmallVector getRanges() { + SmallVector res; + for (auto operand : indexings()) + if (!operand->getType().isa()) + res.push_back(operand); + return res; + } + }]; +} + +def Linalg_TransposeOp : Linalg_Op<"transpose", [NoSideEffect]>, + Arguments<(ins AnyStridedMemRef:$view, AffineMapAttr:$permutation)>, + Results<(outs AnyStridedMemRef)> { + let summary = "transpose operation produces a new strided memref (metadata-only)"; + let description = [{ + The `linalg.transpose` op produces a strided memref whose sizes and strides + are a permutation of the original `view`. This is a pure metadata + transformation. + + Example: + + ```mlir + %1 = linalg.transpose %0 (i, j) -> (j, i) : memref + ``` + }]; + + let builders = [OpBuilder< + "Builder *b, OperationState &result, Value view, " + "AffineMapAttr permutation, ArrayRef attrs = {}">]; + + let verifier = [{ + if (!permutation().isPermutation()) + return emitOpError("expected a permutation map"); + if (permutation().getNumDims() != getViewType().getRank()) + return emitOpError("expected a permutation map of same rank as the view"); + return success(); + }]; + + let extraClassDeclaration = [{ + static StringRef getPermutationAttrName() { return "permutation"; } + MemRefType getViewType() { return view()->getType().cast(); } + }]; +} + +def Linalg_YieldOp : Linalg_Op<"yield", [NativeOpTrait<"IsTerminator">]>, + Arguments<(ins Variadic:$values)> { + let summary = "Linalg yield operation"; + let description = [{ + `linalg.yield` is a special terminator operation for blocks inside regions + in `linalg` generic ops. It returns values to the immediately enclosing + `linalg` generic op. 
+ + Example: + + ```mlir + linalg.yield %f0, %f1 : f32, f32 + ``` + }]; +} + +#endif // LINALG_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td new file mode 100644 index 0000000000000000000000000000000000000000..dd9e09b8eae78abdeac426ad6d0739c283235866 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -0,0 +1,616 @@ +//===- LinalgStructuredOps.td - Linalg dialect library ops -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the operation definition file for structured operations on buffers +// that correspond to underlying library calls (e.g. BLAS). +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_STRUCTURED_OPS +#define LINALG_STRUCTURED_OPS + +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Linalg/IR/LinalgBase.td" + +// The Linalg `NInputs` trait provides the API for ops that are known +// to have a specified number of inputs, all passed as operands. +// See Linalg/LinalgTraits.h for implementation details an usage. +class NInputs : + NativeOpTrait<"linalg::NInputs<" # !cast(args_in) # ">::Impl"> {} + +// The Linalg `NOutputs` trait provides the API for ops that are known +// to have a specified number of outputs, all passed as operands. +// See Linalg/LinalgTraits.h for implementation details an usage. +class NOutputs : + NativeOpTrait<"linalg::NOutputs<" # !cast(args_out) # ">::Impl"> {} + +def ViewTraits : NativeOpTrait<"linalg::ViewTraits">; + +// The linalg 'LinalgStructuredInterface' provides access to the 'LinalgOp' +// interface. 
+def LinalgStructuredInterface : OpInterface<"LinalgOp"> { + let methods = [ + InterfaceMethod< + "Query the number of inputs from the current operation.", + "unsigned", "getNumInputs" + >, + InterfaceMethod< + "Query the number of outputs from the current operation.", + "unsigned", "getNumOutputs" + >, + InterfaceMethod< + "Query the number of inputs and outputs from the current operation.", + "unsigned", "getNumInputsAndOutputs" + >, + InterfaceMethod< + "Query the input operands from the current operation.", + "Operation::operand_range", "getInputs" + >, + InterfaceMethod< + "Query the output operands from the current operation.", + "Operation::operand_range", "getOutputs" + >, + InterfaceMethod< + "Query the input and output operands from the current operation.", + "Operation::operand_range", "getInputsAndOutputs" + >, + InterfaceMethod< + "Query the iterator types attribute within the current operation.", + "ArrayAttr", "iterator_types" + >, + InterfaceMethod< + "Query the indexing maps attribute within the current operation.", + "ArrayAttr", "indexing_maps" + >, + InterfaceMethod< + "Query the number of parallel loops within the current operation.", + "unsigned", "getNumParallelLoops" + >, + InterfaceMethod< + "Query the number of reduction loops within the current operation.", + "unsigned", "getNumReductionLoops" + >, + InterfaceMethod< + "Query the number of window loops within the current operation.", + "unsigned", "getNumWindowLoops" + >, + InterfaceMethod< + "Query the number of loops within the current operation.", + "unsigned", "getNumLoops">, + InterfaceMethod<"Query the input view at the given index.", + "Value ", "getInput", (ins "unsigned":$i) + >, + InterfaceMethod<"Query the output view at the given index.", + "Value ", "getOutput", (ins "unsigned":$i) + >, + InterfaceMethod<[{ + Query the index of the given input value, or `None` if the value is not + an input. 
+ }], + "llvm::Optional", "getIndexOfInput", (ins "Value ":$view) + >, + InterfaceMethod<[{ + Query the index of the given view value, or `None` if the value is not + an view. + }], + "llvm::Optional", "getIndexOfOutput", (ins "Value ":$view) + >, + InterfaceMethod<[{ + Query the type of the input view at the given index. + }], "MemRefType", "getInputViewType", (ins "unsigned":$i)>, + InterfaceMethod<[{ + Query the type of the output view at the given index. + }], "MemRefType", "getOutputViewType", (ins "unsigned":$i)>, + + StaticInterfaceMethod<[{ + Create an operation of the current type with the given location, + operands, and attributes. + }], + "Operation *", "create", + (ins "OpBuilder &":$builder, "Location":$loc, + "ValueRange":$operands, + "ArrayRef":$attributes), [{ + return builder.create(loc, ArrayRef{}, operands, + attributes); + }] + >, + + /// Clone an operation with the given location and operands. This is used to + /// abstract away the optional underlying region creation. + InterfaceMethod<[{ + Clone the current operation with the given location and operands. This + is used to abstract away the optional underlying region creation. + }], + "Operation *", "clone", + (ins "OpBuilder &":$b, "Location":$loc, "ValueRange":$operands), [{ + BlockAndValueMapping map; + unsigned numRegions = op.getOperation()->getNumRegions(); + Operation *res = create(b, loc, operands, op.getAttrs()); + assert(res->getNumRegions() == numRegions && "inconsistent # regions"); + for (unsigned ridx = 0; ridx < numRegions; ++ridx) + op.getOperation()->getRegion(ridx).cloneInto( + &res->getRegion(ridx), map); + return res; + }] + > + ]; +} + +// Base Tablegen class for Linalg ops. +// Linalg ops that correspond to library calls operate on linalg::View as their +// first operands. These may be optionally followed by non-view operands +// depending on the specific Linalg op. 
+class LinalgStructuredBase_Op props> + : Op { + let parser = [{ return parseLinalgStructuredOp(parser, result); }]; + let printer = [{ printLinalgStructuredOp(p, *this); }]; +} + +class LinalgStructured_Op props> + : LinalgStructuredBase_Op { + code libraryCallName = [{ + std::string getLibraryCallName() { + return generateLibraryCallName(getOperation()); + } + }]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Concrete Linalg ops. +//////////////////////////////////////////////////////////////////////////////// +def CopyOp : LinalgStructured_Op<"copy", [NInputs<1>, NOutputs<1>]> { + let description = [{ + Copies the data in the input view into the output view. + + Usage: + ```mlir + linalg.copy(%arg0, %arg1) : memref, + memref + ``` + + One possible lowering to loop form is: + ```mlir + %0 = linalg.dim %arg0, 0 : index + loop.for %i0 = %c0 to %0 step %c1 { + %1 = linalg.load %arg0[%i0] : memref + linalg.store %1, %arg1[%i0] : memref + } + ``` + + Optionally, can take `input_permutation` and `output_permutation` attributes + to reorder the dimensions of the input and output views. + + Usage: + ```mlir + linalg.copy(%arg0, %arg1) {inputPermutation : (i, j, k) -> (i, k, j), + outputPermutation : (i, j, k) -> (k, j, i)} : + memref, + memref + ``` + + One possible lowering to loop form is: + ```mlir + %0 = linalg.dim %arg0, 0 + %1 = linalg.dim %arg0, 1 + %2 = linalg.dim %arg0, 2 + loop.for %i0 = %c0 to %{{.*}} step %c1 { + loop.for %i1 = %c0 to %{{.*}} step %c1 { + loop.for %i2 = %c0 to %{{.*}} step %c1 { + %3 = linalg.load %arg0[%i0, %i2, %i1] : + memref + linalg.store %3, %arg1[%i2, %i1, %i0] : + memref + ``` + + The views are expected to be compatible for correctness but this is not + enforced at the moment. 
+ }]; + let arguments = (ins + AnyStridedMemRef:$input, + AnyStridedMemRef:$output, + OptionalAttr:$inputPermutation, + OptionalAttr:$outputPermutation); + // TODO(ntv) this should go away once the usage of OptionalAttr triggers + // emission of builders with default arguments left unspecified. + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value input, Value output", [{ + return build( + builder, result, input, output, AffineMapAttr(), AffineMapAttr()); + }]>]; + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + unsigned nPar = input()->getType().cast().getRank(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def FillOp : LinalgStructured_Op<"fill", [NInputs<0>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRef:$input, + AnyTypeOf<[AnyFloat, AnyInteger, AnyVector]>:$value); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + unsigned nPar = input()->getType().cast().getRank(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def DotOp : LinalgStructured_Op<"dot", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<0>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + return ArrayAttr::get( + StringAttr::get(getReductionIteratorTypeName(), ctx), ctx); + } + }]; +} + +def MatvecOp : LinalgStructured_Op<"matvec", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<2>, + 
AnyStridedMemRefOfRank<1>, + AnyStridedMemRefOfRank<1>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + Attribute iters[2]{ + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getReductionIteratorTypeName(), ctx)}; + return ArrayAttr::get(iters, ctx); + } + }]; +} + +def MatmulOp : LinalgStructured_Op<"matmul", [NInputs<2>, NOutputs<1>]> { + let arguments = (ins AnyStridedMemRefOfRank<2>, + AnyStridedMemRefOfRank<2>, + AnyStridedMemRefOfRank<2>); + let extraClassDeclaration = libraryCallName # [{ + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + MLIRContext *ctx = getContext(); + Attribute iters[3]{ + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getParallelIteratorTypeName(), ctx), + StringAttr::get(getReductionIteratorTypeName(), ctx)}; + return ArrayAttr::get(iters, ctx); + } + }]; +} + +def ConvOp : LinalgStructured_Op<"conv", [NInputs<2>, NOutputs<1>]> { + let description = [{ + Generic n-D convolution as described in the TF documentation: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/nn/convolution + + ``` + output[b, x[0], ..., x[N-1], k] = + sum_{z[0], ..., z[N-1], q} + filter[z[0], ..., z[N-1], q, k] * + padded_input[b, + x[0] * strides[0] + dilation_rate[0] * z[0], + ..., + x[N-1] * strides[N-1] + dilation_rate[N-1] * z[N-1], + q] + ``` + }]; + + // TODO(ntv) padding. + // Following the TF source of truth above, strides and dilations are integer + // attributes of the same rank as the number of window dimensions. + let arguments = (ins AnyStridedMemRef:$filter, AnyStridedMemRef:$input, + AnyStridedMemRef:$output, + OptionalAttr:$strides, + OptionalAttr:$dilations); + let extraClassDeclaration = libraryCallName # [{ + // TODO(ntv) extend to support more than 1 dimensions and potentially + // grouping too. 
+ unsigned getNumBatchDimensions() { return 1; } + unsigned getNumInputFeatureDimensions() { return 1; } + unsigned getNumOutputFeatureDimensions() { return 1; } + + ArrayAttr indexing_maps(); + + ArrayAttr iterator_types() { + // Outer parallel loops are always the number of output dimensions; i.e. + // [ b, xs, q] in the TF notation above. + unsigned nPar = getOutputViewType(0).getRank(); + unsigned nRed = getNumInputFeatureDimensions(); + // Window loops are a special kind of reduction that is never tiled or + // parallelized across; i.e. [zs] in the TF notation above whose number + // match `xs` (i.e. 1 window loop per "image" dimension). + // This may evolve in the future. + unsigned nWin = + nPar - getNumBatchDimensions() - getNumInputFeatureDimensions(); + MLIRContext *ctx = getContext(); + SmallVector iters( + nPar, StringAttr::get(getParallelIteratorTypeName(), ctx)); + iters.reserve(nPar + nRed + nWin); + iters.append(nRed, StringAttr::get(getReductionIteratorTypeName(), ctx)); + iters.append(nWin, StringAttr::get(getWindowIteratorTypeName(), ctx)); + return ArrayAttr::get(iters, ctx); + } + + int64_t getStride(unsigned i) { + assert(i < getNumWindowLoops()); + if (!strides().hasValue()) return 1; + return strides()->getValue()[i] + .cast().getValue().getSExtValue(); + } + + int64_t getDilation(unsigned i) { + assert(i < getNumWindowLoops()); + if (!dilations().hasValue()) return 1; + return dilations()->getValue()[i] + .cast().getValue().getSExtValue(); + } + }]; + let verifier = [{ return ::verify(*this); }]; +} + +class GenericOpBase : LinalgStructuredBase_Op { + let arguments = (ins Variadic:$views, + I64Attr:$args_in, + I64Attr:$args_out, + AffineMapArrayAttr:$indexing_maps, + ArrayAttr:$iterator_types, + OptionalAttr:$doc, + OptionalAttr:$fun, + OptionalAttr:$library_call); + let regions = (region AnyRegion:$region); + let extraClassDeclaration = [{ + SmallVector linalgTraitAttrNames() { + return SmallVector{ + getArgsInAttrName(), 
getArgsOutAttrName(), getDocAttrName(), + getFunAttrName(), getIndexingMapsAttrName(), getLibraryCallAttrName(), + getIteratorTypesAttrName() + }; + } + unsigned getNumInputs() { return args_in().getSExtValue(); } + unsigned getNumOutputs() { return args_out().getSExtValue(); } + FuncOp getFunction() { + auto moduleOp = getParentOfType(); + return fun().hasValue() ? + moduleOp.lookupSymbol(fun().getValue()) : FuncOp(); + } + StringRef getLibraryCallName() { + return library_call().hasValue() ? library_call().getValue() : ""; + } + AffineMap getIndexingMap(unsigned i) { + assert(i < getNumInputsAndOutputs()); + return indexing_maps().getValue()[i].cast().getValue(); + } + AffineMap getInputIndexingMap(unsigned i) { + assert(i < getNumInputs()); + return indexing_maps().getValue()[i].cast().getValue(); + } + AffineMap getOutputIndexingMap(unsigned i) { + assert(i < getNumOutputs()); + return indexing_maps().getValue()[i + getNumInputs()] + .cast().getValue(); + } + }]; + let printer = [{ return ::print(p, *this); }]; + let parser = [{ return ::parseGenericOp(parser, result); }]; +} + +def GenericOp : GenericOpBase<"generic"> { + let description = [{ + Generic Linalg op form where the key properties of the computation are + specified as attributes. In pretty form, a linalg.generic op is written as: + + ```mlir + linalg.generic #trait_attribute %A, %B, %C {other-attributes} : + memref, + memref, + memref + ``` + + Where #trait_attributes is an alias of a dictionary attribute containing: + - args_in: an I64Attr representing the number of input (readonly) views + - args_out: an I64Attr representing the number of output (readwrite) views + - doc [optional]: a documentation string + - fun: a FlatSymbolRefAttr that must resolve to an existing function + symbol. 
To support inplace updates in a generic fashion, the signature + of the function must be: + ``` + fun([input views element types], [output views element types]) + -> ([output views element types]) + ``` + - indexing_maps: a list of AffineMapAttr, one AffineMapAttr per each input + and output view. Such AffineMapAttr specifies the mapping between the + loops and the indexing within each view. + - library_call [optional]: a StringAttr containing the name of an + external library function that the linalg.generic operation maps to. + The external library is assumed to be dynamically linked and no strong + compile-time guarantees are provided. In the absence of such a library + call, linalg.generic will always lower to loops. + - iterator_types: an ArrayAttr specifying the type of the enclosing loops. + Each element of the list represents and iterator of one of the following + types: + parallel, reduction, window + + Example: + Defining a #matmul_trait attribute in MLIR can be done as follows: + ```mlir + func @fma(%a: f32, %b: f32, %c: f32) -> f32 { + %d = mulf %a, %b: f32 + %e = addf %c, %d: f32 + return %e: f32 + } + #matmul_accesses = [ + (m, n, k) -> (m, k), + (m, n, k) -> (k, n), + (m, n, k) -> (m, n) + ] + #matmul_trait = { + doc = "C(m, n) += A(m, k) * B(k, n)", + fun = @fma, + indexing_maps = #matmul_accesses, + library_call = "linalg_matmul", + n_views = [2, 1], + iterator_types = ["parallel", "parallel", "reduction"] + } + ``` + + And can be reused in multiple places as: + ```mlir + linalg.generic #matmul_trait %A, %B, %C [other-attributes] : + memref, + memref, + memref + ``` + + This may lower to either: + ```mlir + call @linalg_matmul(%A, %B, %C) : + (memref, + memref, + memref) + -> () + ``` + + or IR resembling: + ```mlir + loop.for %m = %c0 to %M step %c1 { + loop.for %n = %c0 to %N step %c1 { + loop.for %k = %c0 to %K step %c1 { + %a = linalg.load %A[%m, %k] : memref + %b = linalg.load %B[%k, %n] : memref + %c = linalg.load %C[%m, %n] : memref + %d = 
call @func_of_elements(%a, %b, %c) + : (f32, f32, f32) -> (f32) + linalg.store %d, %C[%m, %n] : memref + } + } + } + ``` + }]; + let verifier = [{ return ::verify(*this); }]; +} + +def IndexedGenericOp : GenericOpBase<"indexed_generic"> { + let description = [{ + Indexed Generic Linalg op form where the key properties of the computation + are specified as attributes. In pretty form, a linalg.indexed_generic op is + written as: + + ```mlir + linalg.indexed_generic #trait_attribute %A, %B, %C {other-attributes} : + memref, + memref, + memref + ``` + + Where #trait_attributes is an alias of a dictionary attribute containing: + - args_in: an I64Attr representing the number of input (readonly) views + - args_out: an I64Attr representing the number of output (readwrite) views + - doc [optional]: a documentation string + - fun: a FlatSymbolRefAttr that must resolve to an existing function + symbol. To support inplace updates in a generic fashion, the signature + of the function must be: + ``` + fun([index types of induction variables], [input views element types], + [output views element types]) -> ([output views element types]) + ``` + - indexing_maps: a list of AffineMapAttr, one AffineMapAttr per each input + and output view. Such AffineMapAttr specifies the mapping between the + loops and the indexing within each view. + - library_call [optional]: a StringAttr containing the name of an + external library function that the linalg.indexed_generic operation + maps to. The external library is assumed to be dynamically linked and + no strong compile-time guarantees are provided. In the absence of such + a library call, linalg.indexed_generic will always lower to loops. 
+ - iterator_types: an ArrayAttr they type of the enclosing loops; Each + element of the list represents and iterator of one of the following + types: + parallel, reduction, window + + Example: + Defining a #matmul_trait attribute in MLIR can be done as follows: + ```mlir + func @fma(%i: index, %j: index, %k: index, %a: f32, %b: f32, %c: f32) + -> f32 + { + %d = mulf %a, %b: f32 + %e = addf %c, %d: f32 + return %e: f32 + } + #matmul_accesses = [ + (m, n, k) -> (m, k), + (m, n, k) -> (k, n), + (m, n, k) -> (m, n) + ] + #matmul_trait = { + doc = "C(m, n) += A(m, k) * B(k, n)", + fun = @fma, + indexing_maps = #matmul_accesses, + library_call = "linalg_matmul", + n_views = [2, 1], + iterator_types = ["parallel", "parallel", "reduction"] + } + ``` + + And can be reused in multiple places as: + ```mlir + linalg.indexed_generic #matmul_trait %A, %B, %C [other-attributes] : + memref, + memref, + memref + ``` + + This may lower to either: + ```mlir + call @linalg_matmul(%A, %B, %C) : + (memref, + memref, + memref) + -> () + ``` + + or IR resembling: + ```mlir + loop.for %m = %c0 to %M step %c1 { + loop.for %n = %c0 to %N step %c1 { + loop.for %k = %c0 to %K step %c1 { + %a = linalg.load %A[%m, %k] : memref + %b = linalg.load %B[%k, %n] : memref + %c = linalg.load %C[%m, %n] : memref + %d = call @func_of_elements_and_indices(%m, %n, %k, %a, %b, %c) + : (index, index, index, f32, f32, f32) -> (f32) + linalg.store %d, %C[%m, %n] : memref + } + } + } + ``` + }]; + let verifier = [{ return ::verify(*this); }]; +} + +#endif // LINALG_STRUCTURED_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h new file mode 100644 index 0000000000000000000000000000000000000000..e0d651806d3207d233e7aec8c21e2e04eaa72fdb --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h @@ -0,0 +1,157 @@ +//===- LinalgTraits.h - Linalg Traits ---------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_LINALGTRAITS_H_ +#define MLIR_DIALECT_LINALG_LINALGTRAITS_H_ + +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/Support/LLVM.h" + +namespace mlir { +namespace OpTrait { +namespace linalg { + +/// This class provides the API for ops that are known to have a specified +/// number of inputs, all passed as operands. This is used as a trait like this: +/// +/// class DotOp : public Op::Impl> { +/// +template class NInputs { +public: + template + class Impl : public OpTrait::TraitBase::Impl> { + public: + static unsigned getNumInputs() { return N; } + }; +}; + +/// This class provides the API for ops that are known to have a specified +/// number of inputs, all passed as operands. This is used as a trait like this: +/// +/// class DotOp : public Op::Impl> { +/// +template class NOutputs { +public: + template + class Impl : public OpTrait::TraitBase::Impl> { + public: + static unsigned getNumOutputs() { return N; } + }; +}; + +/// This class provides the API for ops that are known to operate on views. This +/// trait must be used in conjunction with an op definition or a trait that +/// provides the methods `getNumInputs` and `getNumOutputs`. This is used as a +/// trait like this: +/// +/// class DotOp : public Op { +/// +template +class ViewTraits : public OpTrait::TraitBase { +private: + /// Return the number of input views. For internal use only. + unsigned nInputs() { + return cast(this->getOperation()).getNumInputs(); + } + /// Return the number of input views. For internal use only. 
+ unsigned nOutputs() { + return cast(this->getOperation()).getNumOutputs(); + } + +public: + /// Return the `i`-th input view. + Value getInput(unsigned i) { + assert(i < nInputs()); + return this->getOperation()->getOperand(i); + } + /// Return the index of `view` in the list of input views if found, llvm::None + /// otherwise. + Optional getIndexOfInput(Value view) { + auto it = llvm::find(getInputs(), view); + if (it != getInputs().end()) + return it - getInputs().begin(); + return llvm::None; + } + /// Return the `i`-th input view type. + MemRefType getInputViewType(unsigned i) { + return getInput(i)->getType().template cast(); + } + /// Return the range over input views. + Operation::operand_range getInputs() { + auto range = this->getOperation()->getOperands(); + return {range.begin(), range.begin() + nInputs()}; + } + /// Return the `i`-th output view. + Value getOutput(unsigned i) { + return this->getOperation()->getOperand(nInputs() + i); + } + /// Return the index of `view` in the list of output views if found, + /// llvm::None otherwise. + Optional getIndexOfOutput(Value view) { + auto it = llvm::find(getOutputs(), view); + if (it != getOutputs().end()) + return it - getOutputs().begin(); + return llvm::None; + } + /// Return the `i`-th output view type. + MemRefType getOutputViewType(unsigned i) { + return getOutput(i)->getType().template cast(); + } + /// Return the range over output views. + Operation::operand_range getOutputs() { + auto range = this->getOperation()->getOperands(); + return {range.begin() + nInputs(), + range.begin() + getNumInputsAndOutputs()}; + } + /// Return the number of input and output views. + unsigned getNumInputsAndOutputs() { return nInputs() + nOutputs(); } + /// Return the `i`-th view type. + MemRefType getViewType(unsigned i) { + return (i < nInputs()) ? getInputViewType(i) + : getOutputViewType(i - nInputs()); + } + /// Return the range over input and output views. 
+ Operation::operand_range getInputsAndOutputs() { + auto range = this->getOperation()->getOperands(); + return {range.begin(), range.begin() + getNumInputsAndOutputs()}; + } + unsigned getNumParallelLoops() { + return getNumIterators( + getParallelIteratorTypeName(), + cast(this->getOperation()).iterator_types()); + } + unsigned getNumReductionLoops() { + return getNumIterators( + getReductionIteratorTypeName(), + cast(this->getOperation()).iterator_types()); + } + unsigned getNumWindowLoops() { + return getNumIterators( + getWindowIteratorTypeName(), + cast(this->getOperation()).iterator_types()); + } + unsigned getNumLoops() { + return getNumIterators( + cast(this->getOperation()).iterator_types()); + } + static LogicalResult verifyTrait(Operation *op) { + auto nViews = cast(op).getNumInputsAndOutputs(); + if (failed(OpTrait::impl::verifyAtLeastNOperands(op, nViews))) + return failure(); + return success(); + } +}; + +} // namespace linalg +} // namespace OpTrait +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_LINALGTRAITS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..abeda3e05528b6d7ba1106cb6cf7dcb9a07573cf --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h @@ -0,0 +1,61 @@ +//===- LinalgTypes.h - Linalg Types ---------------------------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_LINALGTYPES_H_ +#define MLIR_DIALECT_LINALG_LINALGTYPES_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Types.h" + +namespace mlir { +class MLIRContext; + +namespace linalg { +enum LinalgTypes { + Range = Type::FIRST_LINALG_TYPE, + LAST_USED_LINALG_TYPE = Range, +}; + +class LinalgDialect : public Dialect { +public: + explicit LinalgDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "linalg"; } + + /// Parse a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + /// Print a type registered to this dialect. + void printType(Type type, DialectAsmPrinter &os) const override; +}; + +/// A RangeType represents a minimal range abstraction (min, max, step). +/// It is constructed by calling the linalg.range op with three values index of +/// index type: +/// +/// ```mlir +/// func @foo(%arg0 : index, %arg1 : index, %arg2 : index) { +/// %0 = linalg.range %arg0:%arg1:%arg2 : !linalg.range +/// } +/// ``` +class RangeType : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; + /// Construction hook. + static RangeType get(MLIRContext *context) { + /// Custom, uniq'ed construction in the MLIRContext. + return Base::get(context, LinalgTypes::Range); + } + /// Used to implement llvm-style cast. 
+ static bool kindof(unsigned kind) { return kind == LinalgTypes::Range; } +}; + +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_LINALGTYPES_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..86cf6fdd02797aa06d1814816805c9d9cd053d9d --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -0,0 +1,48 @@ +//===- Passes.h - Linalg pass entry points ----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes that expose pass constructors. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_PASSES_H_ +#define MLIR_DIALECT_LINALG_PASSES_H_ + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { +class FuncOp; +class ModuleOp; +template class OpPassBase; + +namespace linalg { +std::unique_ptr> createLinalgFusionPass(); + +std::unique_ptr> +createLinalgTilingPass(ArrayRef tileSizes = {}); + +std::unique_ptr> +createLinalgPromotionPass(bool dynamicBuffers); + +/// Create a pass to convert Linalg operations to loop.for loops and +/// std.load/std.store accesses. +std::unique_ptr> createConvertLinalgToLoopsPass(); + +/// Create a pass to convert Linalg operations to affine.for loops and +/// affine_load/affine_store accesses. +/// Placeholder for now, this is NYI. +std::unique_ptr> createConvertLinalgToAffineLoopsPass(); + +/// Create a pass to convert Linalg operations to the LLVMIR dialect. 
+std::unique_ptr> createConvertLinalgToLLVMPass(); + +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f87938c943effd0b1ea7cda1241c8b6325549dc4 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS LinalgTransformPatterns.td) +mlir_tablegen(LinalgTransformPatterns.h.inc -gen-rewriters) +add_public_tablegen_target(MLIRLinalgTransformPatternsIncGen) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td new file mode 100644 index 0000000000000000000000000000000000000000..8f6762f004896c2605be0dc1e808b2f76b000f25 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td @@ -0,0 +1,108 @@ +//===- LinalgPatterns.td - Linalg transformation patterns --*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the pattern definition file for declarative Linalg transformation. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_TRANSFORMS +#define LINALG_TRANSFORMS + +include "mlir/Dialect/Linalg/IR/LinalgOps.td" +include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.td" +include "mlir/Dialect/AffineOps/AffineOps.td" + +def HasNoLinalgTransformMarker : CPred<[{ + !$0.getAttrOfType(LinalgTransforms::kLinalgTransformMarker) +}]>; + +class HasLinalgTransformMarker : CPred<[{ + $0.getAttrOfType( + LinalgTransforms::kLinalgTransformMarker) && + $0.getAttrOfType( + LinalgTransforms::kLinalgTransformMarker).getValue() == "}] # str # [{"}]>; + +class IsProducedByOpOfType : + CPred<"isProducedByOpOfType<" # str # ">($0, $1)">; + +class AffineMapDomainHasDim : CPred<[{ + $0.getAttrOfType(getIndexingMapsAttrName()).getValue()[0]. + cast().getValue().getNumDims() ==}] # n # [{}]>; + +class HasOperandsOfType: CPred<[{ + llvm::any_of($0.getOperands(), + [](Value v) { + return dyn_cast_or_null<}] # type # [{>(v->getDefiningOp()); + }) +}]>; + +//===----------------------------------------------------------------------===// +// Linalg fusion patterns. +//===----------------------------------------------------------------------===// +// +// In the future, tile sizes should be derived from op properties + machine +// description but we do not need to wait on this to start having useful +// patterns. +class TileAndFuseLinalgOp< + list sizes, list operandIndices, string value> : NativeCodeCall< + "if (failed(tileAndFuseLinalgOpAndSetMarker($_builder, $0, {" # + StrJoinInt.result # "}, {" # StrJoinInt.result # "}," # + " \"" # value # "\")))" # + " return matchFailure();">; + +//===----------------------------------------------------------------------===// +// Linalg tiling patterns. 
+//===----------------------------------------------------------------------===// +// +// In the future, tile sizes should be derived from op properties + machine +// description but we do not need to wait on this to start having useful +// patterns. +// `permutation` is an optional parameter to specify the ordering of the +// tiled loops. If provided, it must be a list of integers with the same number +// of elements as `sizes`. +class TileLinalgOp sizes, string value, list permutation=[]> : + NativeCodeCall< + "if (failed(tileLinalgOpAndSetMarker($_builder, $0, {" # + StrJoinInt.result # "}, \"" # value # "\", {" # + StrJoinInt.result # "})))" # + " return matchFailure();">; + +//===----------------------------------------------------------------------===// +// Linalg to loop patterns. +//===----------------------------------------------------------------------===// +class LinalgOpToLoops : NativeCodeCall< + "if (failed(linalgOpToLoops<" # OpType # ">($_builder, $0))) " # + " return matchFailure();">; + +class LinalgOpToAffineLoops : NativeCodeCall< + "if (failed(linalgOpToAffineLoops<" # OpType # ">($_builder, $0))) " # + " return matchFailure();">; + +//===----------------------------------------------------------------------===// +// Linalg to vector contraction patterns. +//===----------------------------------------------------------------------===// +class LinalgOpToVectorContraction : NativeCodeCall< + "if (failed(vectorizeGenericOp($_builder, $0))) " # + " return matchFailure();">; + +//===----------------------------------------------------------------------===// +// Linalg generic permutation patterns. 
+//===----------------------------------------------------------------------===// +class PermuteGenericLinalgOp permutation, string value> : + NativeCodeCall< + "if (failed(permuteGenericLinalgOp($_builder, $0, {" # + StrJoinInt.result # "}, \"" # value # "\"))) " # + " return matchFailure();">; + +//===----------------------------------------------------------------------===// +// Linalg promote subview operands. +//===----------------------------------------------------------------------===// +class LinalgOpPromoteSubviews : NativeCodeCall< + "if (failed(linalgOpPromoteSubviews($_builder, $0))) " # + " return matchFailure();">; +#endif // LINALG_TRANSFORMS diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h new file mode 100644 index 0000000000000000000000000000000000000000..757ee3ad1a7bdeaae72d896320fba07e15afe688 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h @@ -0,0 +1,96 @@ +//===- LinalgTransforms.h - Linalg transformations as patterns --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_LINALG_TRANSFORMS_LINALGTRANSFORMS_H_ +#define DIALECT_LINALG_TRANSFORMS_LINALGTRANSFORMS_H_ + +#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +#include "llvm/ADT/STLExtras.h" + +namespace mlir { +namespace linalg { + +// Marker used as attribute name in generated Linalg rewriting transformations. 
+struct LinalgTransforms { + static const StringLiteral kLinalgTransformMarker; +}; + +namespace detail { +// Implementation detail of isProducedByOpOfType avoids the need for explicit +// template instantiations. +bool isProducedByOpOfTypeImpl(Operation *consumerOp, Value consumedView, + function_ref isaOpType); +} // namespace detail + +// Returns true if the `consumedView` value use in `consumerOp` is produced by +// an op of type `OpTy`. This is used to implement use-def type information on +// buffers. +template +bool isProducedByOpOfType(Operation *consumerOp, Value consumedView) { + return detail::isProducedByOpOfTypeImpl( + consumerOp, consumedView, [](Operation *op) { return isa(op); }); +} + +//////////////////////////////////////////////////////////////////////////////// +// The following Declarative Rewrite Rule (DRR) helpers are used in rewrite +// patterns. As such, they must not call into `rewriter.erase/replace` APIs and +// it is the responsibility of the enclosing PatternRewriter to erase on +// success. +//////////////////////////////////////////////////////////////////////////////// + +/// Tiles `op` by `sizes` permuting the looops according to `permutation` +/// and sets the attribute `kLinalgTransformMarker` to `linalgMarker`. +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. 
+LogicalResult tileLinalgOpAndSetMarker(PatternRewriter &rewriter, Operation *op, + ArrayRef sizes, + StringRef linalgMarker, + ArrayRef permutation); + +/// Tiles `op` by `sizes`, fuses the producers of `operandIndicesToFuse` and +/// sets the attribute `kLinalgTransformMarker` to `linalgMarker`. +LogicalResult tileAndFuseLinalgOpAndSetMarker( + PatternRewriter &rewriter, Operation *op, ArrayRef sizes, + ArrayRef operandIndicesToFuse, StringRef linalgMarker); + +/// Emits a loop nest of `loop.for` with the proper body for `op`. +template +LogicalResult linalgOpToLoops(PatternRewriter &rewriter, Operation *op); + +/// Emits a loop nest of `affine.for` with the proper body for `op`. +template +LogicalResult linalgOpToAffineLoops(PatternRewriter &rewriter, Operation *op); + +/// Rewrite a linalg.generic into a suitable vector.contraction op. +LogicalResult vectorizeGenericOp(PatternRewriter &rewriter, Operation *op); + +/// Emits a `generic` or `indexed_generic` operation with the `indexing_maps` +/// and `iterator_types` permutated according to `permutation`. +LogicalResult permuteGenericLinalgOp(PatternRewriter &rewriter, Operation *op, + ArrayRef permutation, + StringRef linalgMarker); + +/// Promote std.subviews feeding linalg operations +LogicalResult linalgOpPromoteSubviews(PatternRewriter &rewriter, Operation *op); + +} // namespace linalg +} // namespace mlir + +#endif // DIALECT_LINALG_TRANSFORMS_LINALGTRANSFORMS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Intrinsics.h b/mlir/include/mlir/Dialect/Linalg/Utils/Intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..778d853aeefe91ae718f1b9eebd304792b7a4d67 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Intrinsics.h @@ -0,0 +1,29 @@ +//===- Intrinsics.h - Linalg intrinsics definitions -----------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_INTRINSICS_H_ +#define MLIR_DIALECT_LINALG_INTRINSICS_H_ + +#include "mlir/EDSC/Intrinsics.h" + +namespace mlir { +namespace linalg { +class CopyOp; +class FillOp; +class RangeOp; +class SliceOp; +namespace intrinsics { +using copy = mlir::edsc::intrinsics::OperationBuilder; +using fill = mlir::edsc::intrinsics::OperationBuilder; +using range = mlir::edsc::intrinsics::ValueBuilder; +using slice = mlir::edsc::intrinsics::ValueBuilder; +} // namespace intrinsics +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_INTRINSICS_H_ diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..996658b4c5c73797cf6d95541bf1b24cd9c3b9af --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -0,0 +1,226 @@ +//===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LINALG_UTILS_H_ +#define MLIR_DIALECT_LINALG_UTILS_H_ + +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/EDSC/Helpers.h" + +#include "llvm/ADT/SetVector.h" + +namespace mlir { +class AffineExpr; +class AffineMap; +class OperationFolder; + +namespace edsc { + +/// A LoopRangeBuilder is a generic NestedBuilder for loop.for operations. 
+/// More specifically it is meant to be used as a temporary object for +/// representing any nested MLIR construct that is "related to" an mlir::Value +/// (for now an induction variable). +class LoopRangeBuilder : public NestedBuilder { +public: + /// Constructs a new loop.for and captures the associated induction + /// variable. A ValueHandle pointer is passed as the first argument and is the + /// *only* way to capture the loop induction variable. + LoopRangeBuilder(ValueHandle *iv, ValueHandle range); + LoopRangeBuilder(ValueHandle *iv, Value range); + LoopRangeBuilder(ValueHandle *iv, SubViewOp::Range range); + + LoopRangeBuilder(const LoopRangeBuilder &) = delete; + LoopRangeBuilder(LoopRangeBuilder &&) = default; + + LoopRangeBuilder &operator=(const LoopRangeBuilder &) = delete; + LoopRangeBuilder &operator=(LoopRangeBuilder &&) = default; + + /// The only purpose of this operator is to serve as a sequence point so that + /// the evaluation of `fun` (which build IR snippets in a scoped fashion) is + /// scoped within a LoopRangeBuilder. + ValueHandle operator()(std::function fun = nullptr); +}; + +/// Helper class to sugar building loop.for loop nests from ranges. +/// This is similar to edsc::AffineLoopNestBuilder except it works on ranges +/// directly. In the current implementation it produces loop.for operations. +class LoopNestRangeBuilder { +public: + LoopNestRangeBuilder(ArrayRef ivs, + ArrayRef ranges); + LoopNestRangeBuilder(ArrayRef ivs, + ArrayRef ranges); + LoopNestRangeBuilder(ArrayRef ivs, + ArrayRef ranges); + edsc::ValueHandle operator()(std::function fun = nullptr); + +private: + SmallVector loops; +}; + +} // namespace edsc + +namespace linalg { +class LinalgDependenceGraph; + +struct FusionInfo { + LinalgOp originalProducer; + LinalgOp fusedProducer; +}; + +/// Checks whether the specific `producer` is the last write to exactly the +/// whole `consumedView`. 
This checks structural dominance, that the dependence +/// is a RAW without any interleaved write to any piece of `consumedView`. +bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph, + LinalgOp consumer, Value consumedView, + LinalgOp producer); + +/// Checks whether fusing the specific `producer` of the `consumedView` is +/// feasible. This checks `producer` is the last write of `consumedView` and +/// that no interleaved dependence would be violated (RAW, WAR or WAW). +bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer, + Value consumedView, LinalgOp producer); + +/// Fuses producer into consumer if the producer is structurally feasible and +/// the fusion would not violate dependencies. +/// When non-null, the optional pointer `folder` is used to call into the +/// `createAndFold` builder method. If `folder` is null, the regular `create` +/// method is called. +Optional fuseProducerOf(OpBuilder &b, LinalgOp consumer, + unsigned consumerIdx, + const LinalgDependenceGraph &graph, + OperationFolder *folder = nullptr); + +/// Returns the linearized list of all view dimensions in a linalgOp. Applying +/// the inverse, concatenated loopToOperandRangeMaps to this list allows the +/// derivation of loop ranges for any linalgOp. +template +SmallVector getViewSizes(ConcreteOp linalgOp) { + SmallVector res; + for (auto v : linalgOp.getInputsAndOutputs()) { + MemRefType t = v->getType().template cast(); + for (unsigned i = 0; i < t.getRank(); ++i) + res.push_back(edsc::intrinsics::dim(v, i)); + } + return res; +} + +/// Returns the values obtained by applying `map` to the list of values. +/// When non-null, the optional pointer `folder` is used to call into the +/// `createAndFold` builder method. If `folder` is null, the regular `create` +/// method is called. 
+SmallVector applyMapToValues(OpBuilder &b, Location loc, + AffineMap map, ArrayRef values, + OperationFolder *folder = nullptr); + +struct TiledLinalgOp { + LinalgOp op; + SmallVector loops; +}; + +/// Performs standalone tiling of a single LinalgOp by `tileSizes`. +/// and permute the loop nest according to `permutation` +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. +/// Returns a struct containing the tiled loops in the specified order +/// and the cloned op if successful, llvm::None otherwise. +/// When non-null, the optional pointer `folder` is used to call into the +/// `createAndFold` builder method. If `folder` is null, the regular `create` +/// method is called. +Optional tileLinalgOp(OpBuilder &b, LinalgOp op, + ArrayRef tileSizes, + ArrayRef permutation = {}, + OperationFolder *folder = nullptr); + +/// Performs standalone tiling of a single LinalgOp by constant `tileSizes`. +/// and permute the loop nest according to `permutation` +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. 
+/// Returns a struct containing the tiled loops in the specified order +/// and the cloned op if successful, llvm::None otherwise. +/// When non-null, the optional pointer `folder` is used to call into the +/// `createAndFold` builder method. If `folder` is null, the regular `create` +/// method is called. +Optional tileLinalgOp(OpBuilder &b, LinalgOp op, + ArrayRef tileSizes, + ArrayRef permutation = {}, + OperationFolder *folder = nullptr); + +template +Optional tileLinalgOperation(OpBuilder &b, Operation *op, + Args... args) { + return tileLinalgOp(b, cast(op), args...); +} + +struct PromotionInfo { + Value buffer; + Value fullLocalView; + Value partialLocalView; +}; + +/// Promotes the `subViews` into a new buffer allocated at the insertion point +/// `b`. For now, promotion occurs in 3 steps: +/// 1. Create a new buffer for a full tile (i.e. not clipped at the boundary). +/// 2. Take a full view on the buffer and `linalg.fill` it with zeros (use +/// float zero for now). +/// 3. Take a partial slice of the full view in step 2. and copy into it. +/// Infers statically sized buffers from subViews unless `dynamicBuffers` is +/// true. +/// +/// Returns a list of PromotionInfo which hold the promoted buffer and the +/// full and partial views indexing into the buffer. +SmallVector +promoteSubViews(OpBuilder &b, Location loc, ArrayRef subViews, + bool dynamicBuffers = false, OperationFolder *folder = nullptr); + +/// Returns all the operands of `linalgOp` that are not views. +/// Asserts that these operands are value types to allow transformations like +/// tiling to just use the values when cloning `linalgOp`. +SmallVector getAssumedNonViewOperands(LinalgOp linalgOp); + +/// Apply the permutation defined by `permutation` to `inVec`. +/// Element `i` in `inVec` is mapped to location `j = permutation[i]`. +/// E.g.: for an input vector `inVec = ['a', 'b', 'c']` and a permutation vector +/// `permutation = [2, 0, 1]`, this function leaves `inVec = ['c', 'a', 'b']`. 
+template +void applyPermutationToVector(SmallVector &inVec, + ArrayRef permutation) { + SmallVector auxVec(inVec.size()); + for (unsigned i = 0; i < permutation.size(); ++i) + auxVec[i] = inVec[permutation[i]]; + inVec = auxVec; +} + +/// Prepares the SubView promotion later performed by `promoteSubViews` +/// (where most of the transformation happens). It arranges the new +/// operands for `LinalgOp op` and deallocates the new buffer(s) +/// It is the entry point for declarative transformation +/// Returns the cloned `LinalgOp` with the new operands +LinalgOp promoteSubViewOperands(OpBuilder &b, LinalgOp op, + llvm::SetVector subViews, + bool dynamicBuffers = false, + OperationFolder *folder = nullptr); + +} // namespace linalg +} // namespace mlir + +#endif // MLIR_DIALECT_LINALG_UTILS_H_ diff --git a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fda882d3f54947bf72cb3891622c75942b63a69 --- /dev/null +++ b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(LoopOps LoopOps) diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.h b/mlir/include/mlir/Dialect/LoopOps/LoopOps.h new file mode 100644 index 0000000000000000000000000000000000000000..2617d7fd7839825def7fb6ee62749b0ccb26b7e4 --- /dev/null +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.h @@ -0,0 +1,48 @@ +//===- Ops.h - Loop MLIR Operations -----------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines convenience types for working with loop operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_LOOPOPS_OPS_H_ +#define MLIR_LOOPOPS_OPS_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Transforms/LoopLikeInterface.h" + +namespace mlir { +namespace loop { + +class TerminatorOp; + +class LoopOpsDialect : public Dialect { +public: + LoopOpsDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "loop"; } +}; + +#define GET_OP_CLASSES +#include "mlir/Dialect/LoopOps/LoopOps.h.inc" + +// Insert `loop.terminator` at the end of the only region's only block if it +// does not have a terminator already. If a new `loop.terminator` is inserted, +// the location is specified by `loc`. If the region is empty, insert a new +// block first. +void ensureLoopTerminator(Region ®ion, Builder &builder, Location loc); + +/// Returns the loop parent of an induction variable. If the provided value is +/// not an induction variable, then return nullptr. +ForOp getForInductionVarOwner(Value val); + +} // end namespace loop +} // end namespace mlir +#endif // MLIR_LOOPOPS_OPS_H_ diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td new file mode 100644 index 0000000000000000000000000000000000000000..707b788aaa84654aa73a70aa15a53c6414813d36 --- /dev/null +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td @@ -0,0 +1,147 @@ +//===- Ops.td - Loop operation definitions ---------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines MLIR loop operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LOOP_OPS +#define LOOP_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Transforms/LoopLikeInterface.td" + +def Loop_Dialect : Dialect { + let name = "loop"; + let cppNamespace = ""; +} + +// Base class for Loop dialect ops. +class Loop_Op traits = []> : + Op { + // For every standard op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions. + let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +def ForOp : Loop_Op<"for", + [DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"TerminatorOp">]> { + let summary = "for operation"; + let description = [{ + The "loop.for" operation represents a loop nest taking 3 SSA value as + operands that represent the lower bound, upper bound and step respectively. + The operation defines an SSA value for its induction variable. It has one + region capturing the loop body. The induction variable is represented as an + argument of this region. This SSA value always has type index, which is the + size of the machine word. The step is a value of type index, required to be + positive. + The lower and upper bounds specify a half-open range: the range includes the + lower bound but does not include the upper bound. + + The body region must contain exactly one block that terminates with + "loop.terminator". Calling ForOp::build will create such region and insert + the terminator, so will the parsing even in cases when it is absent from the + custom format. For example: + + loop.for %iv = %lb to %ub step %step { + ... 
// body + } + }]; + let arguments = (ins Index:$lowerBound, Index:$upperBound, Index:$step); + let regions = (region SizedRegion<1>:$region); + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, " + "Value lowerBound, Value upperBound, Value step"> + ]; + + let extraClassDeclaration = [{ + Block *getBody() { return ®ion().front(); } + Value getInductionVar() { return getBody()->getArgument(0); } + OpBuilder getBodyBuilder() { + return OpBuilder(getBody(), std::prev(getBody()->end())); + } + void setLowerBound(Value bound) { getOperation()->setOperand(0, bound); } + void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); } + void setStep(Value step) { getOperation()->setOperand(2, step); } + }]; +} + +def IfOp : Loop_Op<"if", + [SingleBlockImplicitTerminator<"TerminatorOp">]> { + let summary = "if-then-else operation"; + let description = [{ + The "loop.if" operation represents an if-then-else construct for + conditionally executing two regions of code. The operand to an if operation + is a boolean value. The operation produces no results. For example: + + loop.if %b { + ... + } else { + ... + } + + The 'else' block is optional, and may be omitted. For + example: + + loop.if %b { + ... 
+ } + }]; + let arguments = (ins I1:$condition); + let regions = (region SizedRegion<1>:$thenRegion, AnyRegion:$elseRegion); + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, " + "Value cond, bool withElseRegion"> + ]; + + let extraClassDeclaration = [{ + OpBuilder getThenBodyBuilder() { + assert(!thenRegion().empty() && "Unexpected empty 'then' region."); + Block &body = thenRegion().front(); + return OpBuilder(&body, std::prev(body.end())); + } + OpBuilder getElseBodyBuilder() { + assert(!elseRegion().empty() && "Unexpected empty 'else' region."); + Block &body = elseRegion().front(); + return OpBuilder(&body, std::prev(body.end())); + } + }]; +} + +def TerminatorOp : + Loop_Op<"terminator", [NativeOpTrait<"IsTerminator">]> { + let summary = "cf terminator operation"; + let description = [{ + "loop.terminator" is a special terminator operation for blocks inside + loops. It terminates the region. This operation does _not_ have a custom + syntax. However, `std` control operations omit the terminator in their + custom syntax for brevity. + + loop.terminator + }]; + + // No custom parsing/printing form. + let parser = ?; + let printer = ?; + + // Fully specified by traits. 
+ let verifier = ?; +} + +#endif // LOOP_OPS diff --git a/mlir/include/mlir/Dialect/QuantOps/CMakeLists.txt b/mlir/include/mlir/Dialect/QuantOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..90a61c4c194f1ea7f15a2f9ad0e51216eca3c508 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(QuantOps QuantOps) diff --git a/mlir/include/mlir/Dialect/QuantOps/FakeQuantSupport.h b/mlir/include/mlir/Dialect/QuantOps/FakeQuantSupport.h new file mode 100644 index 0000000000000000000000000000000000000000..1a141e3b1b359d4b7151874c240f7693fd8fce17 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/FakeQuantSupport.h @@ -0,0 +1,67 @@ +//===- FakeQuantSupport.h - Support utilities for FakeQuant ops -*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines support utilities for interoperating with FakeQuant* based +// QAT (Quantized Aware Training) computations, as implemented by TFLite. Note +// that FakeQuant* operators mix multiple concerns specific to how TFLite +// originally implemented quantization. As such, utilities here enforce +// opinions taken by that codebase (vs providing any amount of genericity). 
+// +// Specifically, it combines the following concerns, each of which would be +// independent variables in a more generic setup: +// - numBits and isSigned imply storage data type (uint8, int8, int16) +// - numBits < 8 is promoted to uint8 or int8 +// - "narrow_range" narrows the lower bound of the storage type's range by +// 1 +// - the specified min/max values are "nudged" so that the result has a zero +// that can be exactly expressed +// - min=max=0 implies scale=0 and zero_point=0 +// +// With the above assumptions applied, every conforming specified FakeQuant op +// can be represented by a UniformQuantizedType. This scheme is not expected to +// be generalized further in the future and should be considered to be a +// legacy set of rules. +// +// As canonically used in TensorFlow graphs, the presence of a FakeQuant node +// is a hint that the specific math represented here has been simulated at +// training time. As such, it is usually not advised to arbitrarily change +// quantization parameters derived from FakeQuant. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_FAKEQUANTSUPPORT_H_ +#define MLIR_DIALECT_QUANTOPS_FAKEQUANTSUPPORT_H_ + +#include "mlir/Dialect/QuantOps/QuantTypes.h" + +namespace mlir { +namespace quant { + +/// Converts per-layer FakeQuant attributes to the corresponding type. +/// In the event that the parameters cannot be converted, returns a nullptr +/// convertible Type and issues an appropriate error. +/// Note that there are multiple variants of a per-layer FakeQuant op, so +/// this function takes the attributes discretely vs taking a reference to the +/// originating op. +UniformQuantizedType fakeQuantAttrsToType(Location loc, unsigned numBits, + double rmin, double rmax, + bool narrowRange, Type expressedType, + bool isSigned = false); + +/// Converts per-channel FakeQuant attributes to the corresponding type. 
+/// In the event that the parameters cannot be converted, returns a nullptr +/// convertible Type and issues an appropriate error. +UniformQuantizedPerAxisType +fakeQuantAttrsToType(Location loc, unsigned numBits, int32_t quantizedDimension, + ArrayRef rmins, ArrayRef rmax, + bool narrowRange, Type expressedType, + bool isSigned = false); +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_FAKEQUANTSUPPORT_H_ diff --git a/mlir/include/mlir/Dialect/QuantOps/Passes.h b/mlir/include/mlir/Dialect/QuantOps/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..d3109775db2bb29d5a7ca64258aaa908126db632 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/Passes.h @@ -0,0 +1,41 @@ +//===- Passes.h - Quantization Passes ------ --------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the passes owned by the quantization dialect. As +// things mature, it is expected that passes specific to certain frontend or +// backend dialects will move to those dialects directly. For now, they are +// incubated here. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_PASSES_H +#define MLIR_DIALECT_QUANTOPS_PASSES_H + +#include + +namespace mlir { +class FuncOp; +template class OpPassBase; + +namespace quant { + +/// Creates a pass that converts quantization simulation operations (i.e. +/// FakeQuant and those like it) to casts into/out of supported QuantizedTypes. +std::unique_ptr> createConvertSimulatedQuantPass(); + +/// Creates a pass that converts constants followed by a qbarrier to a +/// constant whose value is quantized. 
This is typically one of the last +/// passes done when lowering to express actual quantized arithmetic in a +/// low level representation. Because it modifies the constant, it is +/// destructive and cannot be undone. +std::unique_ptr> createConvertConstPass(); + +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_PASSES_H diff --git a/mlir/include/mlir/Dialect/QuantOps/QuantOps.h b/mlir/include/mlir/Dialect/QuantOps/QuantOps.h new file mode 100644 index 0000000000000000000000000000000000000000..9a4eec67c740f0d7086734b65a42129c705cd873 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/QuantOps.h @@ -0,0 +1,41 @@ +//===- QuantOps.h - Quantization Ops and Types ------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_QUANTOPS_H_ +#define MLIR_DIALECT_QUANTOPS_QUANTOPS_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "llvm/Support/MathExtras.h" + +namespace mlir { +namespace quant { + +/// Defines the 'Quantization' dialect +class QuantizationDialect : public Dialect { +public: + QuantizationDialect(MLIRContext *context); + + /// Parse a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + /// Print a type registered to this dialect. 
+ void printType(Type type, DialectAsmPrinter &os) const override; +}; + +#define GET_OP_CLASSES +#include "mlir/Dialect/QuantOps/QuantOps.h.inc" + +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_QUANTOPS_H_ diff --git a/mlir/include/mlir/Dialect/QuantOps/QuantOps.td b/mlir/include/mlir/Dialect/QuantOps/QuantOps.td new file mode 100644 index 0000000000000000000000000000000000000000..bbeb9419cc4088f2e85764180d850f06c22166c4 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/QuantOps.td @@ -0,0 +1,258 @@ +//===- QuantOps.td - Quantization operation definition -----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the operation definition file for Quantization. +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_QUANTOPS_QUANT_OPS_ +#define DIALECT_QUANTOPS_QUANT_OPS_ + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/QuantOps/QuantPredicates.td" + +def quant_Dialect : Dialect { + let name = "quant"; +} + +//===----------------------------------------------------------------------===// +// Base classes +//===----------------------------------------------------------------------===// + +class quant_Op traits> : + Op; + +//===----------------------------------------------------------------------===// +// Quantization casts +//===----------------------------------------------------------------------===// +// A QuantizeCast (qcast) represents a potential type shift from a quantizable +// type to a quantized type. +// +// At runtime, a qcast will apply the transformation expressed by its +// operand and result type. 
For flexibility during transformation, it is also +// possible to have a qcast that performs no transformation (both its +// operand and result type are quantizable). +// +// A qcast will typically originate from either: +// a) An expressed or implied constraint in the source dialect which signals +// that a certain level of quantization is possible or required. +// b) An inference made by a quantization algorithm indicating that a +// quantized representation may be acceptable. +// +// Especially early in transformation, it is common to have pairs of +// qcast/dcast at points where a transition to a quantized type is +// required. In addition, it is also common to have an identity qcast +// (where the operand and result type are not quantized) at all points where +// it is legal to use a quantized representation (but is not known to be +// acceptable). +def quant_QuantizeCastOp : quant_Op<"qcast", [NoSideEffect]> { + let arguments = (ins quant_RealValueType:$arg); + let results = (outs quant_RealValueType); +} + +// A DequantizeCast op (dcast) represents the inverse of a qcast, +// converting back from a quantized to quantizable (expressed) type. +// +// Like qcasts, a dcast is allowed to have both its operand and result +// as non quantized types. This facilitates transformations and marks edges +// where the computation must be carried out in the expressed type. +// +// Especially early in transformation, it is common to have dcasts on +// all operands to ops that must operate with the expressed type (typically +// math ops prior to lowering to target-specific, quantized kernels). +def quant_DequantizeCastOp : quant_Op<"dcast", [NoSideEffect]> { + let arguments = (ins quant_RealValueType:$arg); + let results = (outs quant_RealValueType); +} + +// A StorageCast (scast) represents a cast from or to a type based on the +// storage type and a type based on a corresponding quantized type. 
+//
+// This op exists to ensure type coherency between parts of the computation
+// which are operating directly on an underlying storage type and those which
+// operate on quantized values.
+//
+// Examples from storage to quantized type:
+//   i8 -> !quant<"uniform[i8:f32]{1.0}">
+//   tensor<4xi8> -> tensor<4x!quant<"uniform[i8:f32]{1.0}">>
+//   vector<4xi8> -> vector<4x!quant<"uniform[i8:f32]{1.0}">>
+def quant_StorageCastOp : quant_Op<"scast", [NoSideEffect]> {
+  let arguments = (ins quant_RealOrStorageValueType:$arg);
+  let results = (outs quant_RealOrStorageValueType);
+  let hasFolder = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Training integration and instrumentation ops
+//===----------------------------------------------------------------------===//
+
+def quant_ConstFakeQuant : quant_Op<"const_fake_quant",
+    [SameOperandsAndResultType, NoSideEffect]> {
+  let summary =
+      "Simulates the effect of uniform quantization with const range.";
+
+  let description = [{
+    Given a const min, max, num_bits and narrow_range attribute, applies the
+    same uniform quantization simulation as is done by the TensorFlow
+    fake_quant_with_min_max_args op. See the fakeQuantAttrsToType() utility
+    method and the quant-convert-simulated-quantization pass for further
+    details.
+  }];
+
+  let arguments = (ins
+    F32Tensor:$inputs,
+    F32Attr:$min,
+    F32Attr:$max,
+    // The bitwidth of the quantization; between 2 and 16, inclusive.
+    I64Attr:$num_bits,
+    // Quantization range starts from 0 or 1; starts from 1 if true.
+    DefaultValuedAttr:$narrow_range,
+    // The sign of the quantization.
+    DefaultValuedAttr:$is_signed
+  );
+
+  let results = (outs
+    F32Tensor:$outputs
+  );
+}
+
+def quant_ConstFakeQuantPerAxis : quant_Op<"const_fake_quant_per_axis",
+    [SameOperandsAndResultType, NoSideEffect]> {
+  let summary =
+      "Simulates the effect of per axis uniform quantization with const range.";
+
+  let description = [{
+    Given a const min, max, num_bits and narrow_range attribute, applies the
+    same per axis uniform quantization simulation as is done by the TensorFlow
+    fake_quant_with_min_max_vars_per_channel op. See the fakeQuantAttrsToType()
+    utility method and the quant-convert-simulated-quantization pass for
+    further details.
+  }];
+
+  let arguments = (ins
+    F32Tensor:$inputs,
+    F32ArrayAttr:$min,
+    F32ArrayAttr:$max,
+    // The quantized dimension of the inputs tensor.
+    I64Attr:$axis,
+    // The bitwidth of the quantization; between 2 and 16, inclusive.
+    I64Attr:$num_bits,
+    // Quantization range starts from 0 or 1; starts from 1 if true.
+    DefaultValuedAttr:$narrow_range,
+    // The sign of the quantization.
+    DefaultValuedAttr:$is_signed
+  );
+
+  let results = (outs
+    F32Tensor:$outputs
+  );
+}
+
+def quant_StatisticsRefOp : quant_Op<"stats_ref", [SameOperandsAndResultType]> {
+  let summary =
+      "Indicates that statistics are resolved by reference.";
+
+  let description = [{
+    This op acts as an identity that, when encountered at runtime, should result
+    in statistics being collected about the value of its operand/result.
+    Such statistics will be stored with the provided key, allowing this node
+    to later be converted to a 'stats' op if statistics with that key have been
+    encountered.
+ }]; + + let arguments = (ins + quant_RealValueType:$arg, + StrAttr:$statsKey + ); + let results = (outs quant_RealValueType); +} + +def quant_StatisticsOp : quant_Op<"stats", [SameOperandsAndResultType]> { + let summary = + "Identity op which associates statistics with the value."; + + let description = [{ + Associates statistics about the runtime ranges of values observed for + evaluations of this node. + + Statistics about the entire type are reported in the 'layerStats' attribute + and those for each axis, in the (optional) `axisStats` attribute. The + interpretation of each is determined by the last dimension of its shape. + Currently, only dim=2 is supported, which is interpreted as [min, max]. + + `layerStats` must be a rank 1 tensor: [2] + `axisStats` must be a rank 2 tensor: [N, 2], where N=the slice size + splitted by the `axis` dimension. For example: + , axis=3 => N=2 + , axis=2 => N=6 + }]; + + let arguments = (ins + quant_RealValueType:$arg, + ElementsAttr:$layerStats, + OptionalAttr:$axisStats, + OptionalAttr:$axis); + let results = (outs quant_RealValueType); + + let verifier = [{ + auto tensorArg = arg()->getType().dyn_cast(); + if (!tensorArg) return emitOpError("arg needs to be tensor type."); + + // Verify layerStats attribute. + { + auto layerStatsType = layerStats().getType(); + if (!layerStatsType.getElementType().isa()) { + return emitOpError( + "layerStats must have a floating point element type"); + } + if (layerStatsType.getRank() != 1 || layerStatsType.getDimSize(0) != 2) { + return emitOpError("layerStats must have shape [2]"); + } + } + // Verify axisStats (optional) attribute. 
+ if (axisStats()) { + if (!axis()) return emitOpError("axis must be specified for axisStats"); + + auto shape = tensorArg.getShape(); + auto argSliceSize = std::accumulate(std::next(shape.begin(), + axis()->getSExtValue()), shape.end(), 1, std::multiplies()); + + auto axisStatsType = axisStats()->getType(); + if (!axisStatsType.getElementType().isa()) { + return emitOpError("axisStats must have a floating point element type"); + } + if (axisStatsType.getRank() != 2 || + axisStatsType.getDimSize(1) != 2 || + axisStatsType.getDimSize(0) != argSliceSize) { + return emitOpError("axisStats must have shape [N,2] " + "where N = the slice size defined by the axis dim"); + } + } + return success(); + }]; +} + +def quant_CoupledRefOp : quant_Op<"coupled_ref", [SameOperandsAndResultType]> { + let summary = + "Indicates that one point of the computation is coupled to another."; + + let description = [{ + Ordinarily, relationships between ops for the purposes of determining + compatible quantized types is explicit based on the use-def chain. However, + in some situations, a use may be separated from its def by arbitrary + external connections. In such a case, during analysis, all coupled_ref + nodes in a module which share a coupledKey will be considered to be + directly connected as via an identity op for the purpose of type inference. 
+ }]; + + let arguments = (ins + quant_RealValueType:$arg, + StrAttr:$coupledKey); + let results = (outs quant_RealValueType); +} + +#endif // DIALECT_QUANTOPS_QUANT_OPS_ diff --git a/mlir/include/mlir/Dialect/QuantOps/QuantPredicates.td b/mlir/include/mlir/Dialect/QuantOps/QuantPredicates.td new file mode 100644 index 0000000000000000000000000000000000000000..7225dcc72db1ee4e652c0b80f2d982674d52151b --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/QuantPredicates.td @@ -0,0 +1,63 @@ +//===- QuantPredicates.td - Predicates for dialect types ---*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Predicates for types in the Quantization dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_QUANTOPS_QUANT_PREDICATES_ +#define DIALECT_QUANTOPS_QUANT_PREDICATES_ + +//===----------------------------------------------------------------------===// +// Quantization type definitions +//===----------------------------------------------------------------------===// + +class quant_TypedPrimitiveOrContainer : + Type.predicate, + VectorOf<[etype]>.predicate]>, + "primitive/tensor/vector of " # etype.description>; + +// An implementation of QuantizedType. +def quant_QuantizedType : + Type()">, "QuantizedType">; + +// A primitive type that can represent a real value. This is either a +// floating point value or a quantized type. +def quant_RealPrimitiveType : + Type, + "real valued primitive (float or quantized type)">; + +// A primitive type that can represent a storage value. This is either an +// integer or quantized type. 
+def quant_StoragePrimitiveType : + Type, + "quantized storage primitive (integer or quantized type)">; + +// A primitive or container of RealPrimitiveType. +def quant_RealValueType : + quant_TypedPrimitiveOrContainer; + +// A primitive or container of StoragePrimitiveType. +def quant_StorageValueType : + quant_TypedPrimitiveOrContainer; + +// Either a real valued or storage primitive or container type. +def quant_RealOrStorageValueType : + Type>; + +// An implementation of UniformQuantizedType. +def quant_UniformQuantizedType : + Type()">, "UniformQuantizedType">; + +// Predicate for detecting a container or primitive of UniformQuantizedType. +def quant_UniformQuantizedValueType : + quant_TypedPrimitiveOrContainer; + +#endif // DIALECT_QUANTOPS_QUANT_PREDICATES_ diff --git a/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h b/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..daeb03744608d8d49442618910c33bface4d3bb7 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h @@ -0,0 +1,402 @@ +//===- QuantTypes.h - Quantization Ops and Types ----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_QUANT_TYPES_H_ +#define MLIR_DIALECT_QUANTOPS_QUANT_TYPES_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "llvm/Support/MathExtras.h" + +namespace mlir { +namespace quant { + +class QuantizedIntegerType; + +namespace detail { + +struct QuantizedTypeStorage; +struct AnyQuantizedTypeStorage; +struct UniformQuantizedTypeStorage; +struct UniformQuantizedPerAxisTypeStorage; + +} // namespace detail + +namespace QuantizationTypes { +enum Kind { + Any = Type::FIRST_QUANTIZATION_TYPE, + UniformQuantized, + UniformQuantizedPerAxis, + LAST_USED_QUANTIZATION_TYPE = UniformQuantizedPerAxis, +}; +} // namespace QuantizationTypes + +/// Enumeration of bit-mapped flags related to quantized types. +namespace QuantizationFlags { +enum FlagValue { + // Indicates that the storage type should be interpreted as a signed + // integer. The default is to interpret it as an unsigned value. + Signed = 1, +}; +} // namespace QuantizationFlags + +/// Base class for all quantized types known to this dialect. +/// All quantized types have: +/// - storageType: The (narrower) numeric type that is being used to +/// approximate some expressed type. +/// - expressedType: The type that is being approximated. +/// +/// The base class provides generic support for manipulating the types based +/// on these fields. +class QuantizedType : public Type { +public: + using ImplType = detail::QuantizedTypeStorage; + using Type::Type; + + /// The maximum number of bits supported for storage types. 
+ static constexpr unsigned MaxStorageBits = 32; + + static LogicalResult + verifyConstructionInvariants(Optional loc, MLIRContext *context, + unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax); + + /// Support method to enable LLVM-style type casting. + static bool classof(Type type) { + return type.getKind() >= Type::FIRST_QUANTIZATION_TYPE && + type.getKind() <= QuantizationTypes::LAST_USED_QUANTIZATION_TYPE; + } + + /// Gets the minimum possible stored by a storageType. storageTypeMin must + /// be greater than or equal to this value. + static int64_t getDefaultMinimumForInteger(bool isSigned, + unsigned integralWidth) { + if (isSigned) { + return llvm::minIntN(integralWidth); + } + return 0; + } + + /// Gets the maximum possible stored by a storageType. storageTypeMax must + /// be less than or equal to this value. + static int64_t getDefaultMaximumForInteger(bool isSigned, + unsigned integralWidth) { + if (isSigned) { + return llvm::maxIntN(integralWidth); + } + return llvm::maxUIntN(integralWidth); + } + + /// Gets the original expressed type that this quantized type approximates. + /// Note that this presumes that the quantized type was always derived from + /// a floating point type, which in the broadest definition, is not true (i.e. + /// it could be some form of integral, fixed type or affine type in its own + /// right); however, at the high level, no examples of such usage are + /// presently known and the restriction serves some useful purposes (such as + /// always being able to reverse a transformation or measure error). In most + /// cases, this will be f32. + Type getExpressedType() const; + + /// Gets the flags associated with this type. Typically a more specific + /// accessor is appropriate. + unsigned getFlags() const; + + // Convenience helpers. + /// Whether the storage type should be interpreted as a signed quantity + /// (true) or an unsigned value (false). 
+ bool isSigned() const { + return (getFlags() & QuantizationFlags::Signed) == + QuantizationFlags::Signed; + } + + /// Gets the underlying type used for to store values. Note that this may + /// be signed or unsigned. Use the isSigned() accessor to differentiate. + Type getStorageType() const; + + /// The minimum value that storageType can take. + int64_t getStorageTypeMin() const; + + /// The maximum value that storageType can take. + int64_t getStorageTypeMax() const; + + /// Gets the integral bit width that the underlying storage type can exactly + /// represent. For integral storage types, this will just be their width. + unsigned getStorageTypeIntegralWidth() const; + + /// Returns whether the candidateExpressedType is a match for this + /// QuantizedType. This will be true if the candidate type is either a + /// primitive type or a container type whose element type equals this + /// QuantizedType's expressed type. + /// Examples of compatible candidateExpressedType: + /// !quant.uniform =~ f32 + /// !quant.uniform =~ tensor<4xf32> + bool isCompatibleExpressedType(Type candidateExpressedType); + + /// Returns the element type as a QuantizedType or nullptr if it is not + /// a quantized type. If the type is primitive, returns that. If it is a + /// container (vector/tensor), return the element type. + /// Examples: + /// !quant.uniform -> !quant.uniform + /// tensor<4x!quant.uniform -> quant.uniform + static QuantizedType getQuantizedElementType(Type primitiveOrContainerType); + + /// Casts from a type based on the storageType to a corresponding type based + /// on this type (returns nullptr if the cast is not valid). + /// Examples: + /// i8 -> !quant.uniform + /// tensor<4xi8> -> tensor<4x!quant.uniform> + /// vector<4xi8> -> vector<4x!quant.uniform> + Type castFromStorageType(Type candidateType); + + /// Casts from a type based on a QuantizedType to a corresponding type based + /// on the storageType (returns nullptr if the cast is not valid). 
+ /// This is the inverse of castFromStorageType(). + static Type castToStorageType(Type quantizedType); + + /// Casts from a type based on the expressedType to a corresponding type based + /// on this type (returns nullptr if the cast is not valid). + /// Examples: + /// f32 -> !quant.uniform + /// tensor<4xf32> -> tensor<4x!quant.uniform> + /// vector<4xf32> -> vector<4x!quant.uniform> + Type castFromExpressedType(Type candidateType); + + /// Casts from a type based on QuantizedType to a corresponding type based + /// on the expressedType (returns nullptr if the cast is not valid). + /// This is the inverse of castFromExpressedType. + static Type castToExpressedType(Type quantizedType); + + /// Casts from a type based on the expressedType to the equivalent type + /// based on storageType by way of this QuantizedType. Equivalent to: + /// QuantizedType::castToStorageType(castFromExpressedType(candidateType)) + /// (but with validity checks). + /// Example (for this = !quant.uniform): + /// tensor<4xf32> -> tensor<4xi8> + Type castExpressedToStorageType(Type candidateType); + +private: + /// Hide the following methods inherited from `Type`. It is almost certainly + /// a bug to call them from a `QuantizedType` object. Users should call + /// `getStorageType` or `getExpressedType` to get the underlying types + /// they want to inspect. + using Type::isBF16; + using Type::isF16; + using Type::isF32; + using Type::isF64; + using Type::isIndex; + using Type::isInteger; +}; + +/// A quantized type that maps storage to/from expressed types in an +/// unspecified way. +/// +/// Typical syntax: +/// quant.any +/// quant.any +/// quant.any> +/// +/// Note that for the any type, the expressed type is optional. +class AnyQuantizedType + : public Type::TypeBase { +public: + using Base::Base; + + /// Support method to enable LLVM-style type casting. 
+ static bool kindof(unsigned kind) { return kind == QuantizationTypes::Any; } + + /// Gets an instance of the type with all parameters specified but not + /// checked. + static AnyQuantizedType get(unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax); + + /// Gets an instance of the type with all specified parameters checked. + /// Returns a nullptr convertible type on failure. + static AnyQuantizedType getChecked(unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax, Location location); + + /// Verifies construction invariants and issues errors/warnings. + static LogicalResult + verifyConstructionInvariants(Optional loc, MLIRContext *context, + unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax); +}; + +/// Represents a family of uniform, quantized types. +/// +/// Each instance of this type expresses a mapping between real values (most +/// often expressed in floating point f32) and quantized values (either fixed +/// point or affine). +/// +/// The relationship is: +/// real_value = scale * (quantized_value - zero_point) +/// +/// It is used as part of high level graph transformations that have the goal +/// of re-expressing parts of a computation in terms of this common form for +/// more efficient execution at runtime. In addition, it is designed to be +/// expressive enough to facilitate lowering to precise types and operations +/// in target hardware. +/// +/// As a high-level type, focused on intermediate passes, this type holds +/// opinions consistent with high-level usage. If lowering math kernels below +/// the high level arithmetic ops (i.e. to LLVM IR or hardware specific +/// instruction sets), it is expected that the information expressed here +/// will be used to drive low level codegen and target specific type selection, +/// but this type will likely be erased in the process. 
+/// +/// Syntax synopsis: +/// Per-layer, all parameters expressed: +/// !quant +/// Per-layer, optional parameters omitted: +/// !quant +/// +/// StorageType: 'i'|'u' NumBits +/// ExpressedType: 'f16', 'f32', 'bf16', 'f64' +/// Scale: A legal double value +/// ZeroPoint: An integer value +class UniformQuantizedType + : public Type::TypeBase { +public: + using Base::Base; + + /// Gets an instance of the type with all parameters specified but not + /// checked. + static UniformQuantizedType get(unsigned flags, Type storageType, + Type expressedType, double scale, + int64_t zeroPoint, int64_t storageTypeMin, + int64_t storageTypeMax); + + /// Gets an instance of the type with all specified parameters checked. + /// Returns a nullptr convertible type on failure. + static UniformQuantizedType + getChecked(unsigned flags, Type storageType, Type expressedType, double scale, + int64_t zeroPoint, int64_t storageTypeMin, int64_t storageTypeMax, + Location location); + + /// Verifies construction invariants and issues errors/warnings. + static LogicalResult verifyConstructionInvariants( + Optional loc, MLIRContext *context, unsigned flags, + Type storageType, Type expressedType, double scale, int64_t zeroPoint, + int64_t storageTypeMin, int64_t storageTypeMax); + + /// Support method to enable LLVM-style type casting. + static bool kindof(unsigned kind) { + return kind == QuantizationTypes::UniformQuantized; + } + + /// Gets the scale term. The scale designates the difference between the real + /// values corresponding to consecutive quantized values differing by 1. + double getScale() const; + + /// Gets the storage value corresponding to the real value 0 in the affine + /// equation. + int64_t getZeroPoint() const; + + // Fixed point values are real numbers divided by a scale. + // Currently, only signed storage types are treated as fixed point. + // A fixed point value can be obtained from an affine value by subtracting + // the zeroPoint. 
+ // In the future, this may be explicit versus implied by type and zeroPoint. + bool isFixedPoint() const { return isSigned() && getZeroPoint() == 0; } +}; + +/// Represents per-axis (also known as per-channel quantization). +/// +/// Syntax synopsis: +/// Per-axis, all parameters expressed: +/// !quant +/// Per-axis, optional parameters omitted: +/// !quant +/// +/// StorageType: 'i'|'u' NumBits +/// ExpressedType: 'f16', 'f32', 'bf16', 'f64' +/// QuantizedDim: An integer value +/// QuantParams: (Scale ':' ZeroPoint)+ +/// Scale: A legal double value +/// ZeroPoint: An integer value +class UniformQuantizedPerAxisType + : public Type::TypeBase { +public: + using Base::Base; + + /// Gets an instance of the type with all parameters specified but not + /// checked. + static UniformQuantizedPerAxisType + get(unsigned flags, Type storageType, Type expressedType, + ArrayRef scales, ArrayRef zeroPoints, + int32_t quantizedDimension, int64_t storageTypeMin, + int64_t storageTypeMax); + + /// Gets an instance of the type with all specified parameters checked. + /// Returns a nullptr convertible type on failure. + static UniformQuantizedPerAxisType + getChecked(unsigned flags, Type storageType, Type expressedType, + ArrayRef scales, ArrayRef zeroPoints, + int32_t quantizedDimension, int64_t storageTypeMin, + int64_t storageTypeMax, Location location); + + /// Verifies construction invariants and issues errors/warnings. + static LogicalResult verifyConstructionInvariants( + Optional loc, MLIRContext *context, unsigned flags, + Type storageType, Type expressedType, ArrayRef scales, + ArrayRef zeroPoints, int32_t quantizedDimension, + int64_t storageTypeMin, int64_t storageTypeMax); + + /// Support method to enable LLVM-style type casting. + static bool kindof(unsigned kind) { + return kind == QuantizationTypes::UniformQuantizedPerAxis; + } + + /// Gets the quantization scales. 
The scales designate the difference between
+  /// the real values corresponding to consecutive quantized values differing
+  /// by 1. The ith scale corresponds to the ith slice in the
+  /// quantized_dimension.
+  ArrayRef getScales() const;
+
+  /// Gets the storage values corresponding to the real value 0 in the affine
+  /// equation. The ith zero point corresponds to the ith slice in the
+  /// quantized_dimension.
+  ArrayRef getZeroPoints() const;
+
+  /// Specifies the dimension of the Tensor's shape that the scales and
+  /// zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1]
+  /// with quantization params:
+  ///   scales=[1.0, 2.0, 3.0], zeroPoints=[1, 2, 3], quantizedDimension=1
+  /// will be quantized across the second dimension of t.
+  ///   t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1
+  ///   t[:, 1, :, :] will have scale[1]=2.0, zero_point[1]=2
+  ///   t[:, 2, :, :] will have scale[2]=3.0, zero_point[2]=3
+  int32_t getQuantizedDimension() const;
+
+  /// Fixed point values are real numbers divided by a scale.
+  /// Currently, only signed storage types are treated as fixed point.
+  /// A fixed point value can be obtained from an affine value by subtracting
+  /// the zeroPoint.
+  /// In the future, this may be explicit versus implied by type and zeroPoint.
+ bool isFixedPoint() const { + if (!isSigned()) + return false; + return llvm::all_of(getZeroPoints(), + [](int64_t zeroPoint) { return zeroPoint != 0; }); + } +}; + +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_QUANT_TYPES_H_ diff --git a/mlir/include/mlir/Dialect/QuantOps/QuantizeUtils.h b/mlir/include/mlir/Dialect/QuantOps/QuantizeUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..c40b9e6f0265d73142032964ee480fc97001c15e --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/QuantizeUtils.h @@ -0,0 +1,61 @@ +//===- QuantizeUtils.h - Support utilities for quantization -----*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_QUANTIZEUTILS_H_ +#define MLIR_DIALECT_QUANTOPS_QUANTIZEUTILS_H_ + +namespace mlir { +class Attribute; +class Type; + +namespace quant { +class QuantizedType; +class UniformQuantizedType; +class UniformQuantizedValueConverter; + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType(). +/// Returns nullptr if the conversion is not supported. On success, stores the +/// converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. 
realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttr(Attribute realValue, QuantizedType quantizedElementType, + Type &outConvertedType); + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType() and casted to an +/// UniformQuantizedType. Returns nullptr if the conversion is not supported. On +/// success, stores the converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttrUniform(Attribute realValue, + UniformQuantizedType quantizedElementType, + const UniformQuantizedValueConverter &converter, + Type &outConvertedType); +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_QUANTIZEUTILS_H_ diff --git a/mlir/include/mlir/Dialect/QuantOps/UniformSupport.h b/mlir/include/mlir/Dialect/QuantOps/UniformSupport.h new file mode 100644 index 0000000000000000000000000000000000000000..7c74fc56b8f03f4ef6b0af6cb41fb768c19de034 --- /dev/null +++ b/mlir/include/mlir/Dialect/QuantOps/UniformSupport.h @@ -0,0 +1,218 @@ +//===- UniformSupport.h - Support utilities for uniform quant ---*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_QUANTOPS_UNIFORMSUPPORT_H_ +#define MLIR_DIALECT_QUANTOPS_UNIFORMSUPPORT_H_ + +#include "mlir/Dialect/QuantOps/QuantTypes.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" + +namespace mlir { +namespace quant { + +/// Performs type conversion from an arbitrary input type to a type +/// that is expressed by a QuantizedType. +/// +/// This handles cases where the inputType is a supported primitive type +/// (i.e. f32, bf16, etc) or a vector/tensor type based on a supported +/// elemental type. +/// +/// Since conversion often involves introspecting some attributes of the +/// input type in order to determine how to represent it, this is a two step +/// process. +struct ExpressedToQuantizedConverter { + /// Creates a converter for the given input type. + static const ExpressedToQuantizedConverter forInputType(Type inputType); + + /// Converts the inputType to be based on the given elemental type, + /// returning the new type (or nullptr and emit an error on failure). + Type convert(QuantizedType elementalType) const; + + /// Whether the conversion is legal. + explicit operator bool() const { return (bool)expressedType; } + + /// The input type that is being converted from. + /// This may be an elemental or composite type. + const Type inputType; + + /// Supported, elemental expressed type (i.e. f32). + /// Will be nullptr if conversion is not supported. + const Type expressedType; +}; + +/// Reference implementation of converting between real numbers and values +/// represented by a UniformQuantizedType. +/// Note that this is not expected to be speedy and may be superseded eventually +/// by a more optimal implementation. 
+/// Also, the interface assumes that quantization is done per-layer and will +/// need to be wider for various per-channel schemes. As such, this is a +/// placeholder. +class UniformQuantizedValueConverter { +public: + explicit UniformQuantizedValueConverter(UniformQuantizedType uniformType) + : UniformQuantizedValueConverter( + uniformType.getScale(), + static_cast(uniformType.getZeroPoint()), + static_cast(uniformType.getStorageTypeMin()), + static_cast(uniformType.getStorageTypeMax()), + uniformType.getStorageTypeIntegralWidth(), uniformType.isSigned()) { + assert(uniformType.getExpressedType().isa()); + assert(uniformType.getStorageType().isa()); + } + + UniformQuantizedValueConverter(double scale, double zeroPoint, + double clampMin, double clampMax, + uint32_t storageBitWidth, bool isSigned) + : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin), + clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint), + clampMinDouble(clampMin), clampMaxDouble(clampMax), + storageBitWidth(storageBitWidth), isSigned(isSigned), + roundMode(APFloat::rmNearestTiesToAway) {} + + UniformQuantizedValueConverter(double scale, double zeroPoint, + APFloat clampMin, APFloat clampMax, + uint32_t storageBitWidth, bool isSigned) + : scale(scale), zeroPoint(zeroPoint), clampMin(clampMin), + clampMax(clampMax), scaleDouble(scale), zeroPointDouble(zeroPoint), + clampMinDouble(clampMin.convertToDouble()), + clampMaxDouble(clampMax.convertToDouble()), + storageBitWidth(storageBitWidth), isSigned(isSigned), + roundMode(APFloat::rmNearestTiesToAway) {} + + virtual APInt quantizeFloatToInt(APFloat expressedValue) const { + // This function is a performance critical code path in quantization + // since it runs for each single float parameter value. + + // Specialize f32->u8/i8 case to optimize performance. 
+ if (&expressedValue.getSemantics() == &APFloat::IEEEsingle() && + storageBitWidth == 8 && + roundMode == llvm::APFloatBase::rmNearestTiesToAway) { + return quantizeF32ToInt8(expressedValue); + } + + bool lossy; + expressedValue.convert(scale.getSemantics(), roundMode, &lossy); + // fixedpoint = clamp(clampMin, clampMax, ( + // roundHalfToEven(expressed / scale) + zeroPoint)) + APFloat scaled = (expressedValue / scale); + scaled.roundToIntegral(roundMode); + scaled.add(zeroPoint, roundMode); + APFloat fixedpoint = llvm::minimum(scaled, clampMax); + fixedpoint = llvm::maximum(fixedpoint, clampMin); + + llvm::APSInt result(storageBitWidth, !isSigned); + fixedpoint.convertToInteger(result, roundMode, &lossy); + + return std::move(result); + } + + int64_t quantizeFloatToInt64(APFloat expressedValue) const { + APInt qValue = quantizeFloatToInt(expressedValue); + return isSigned ? qValue.getSExtValue() : qValue.getZExtValue(); + } + + virtual ~UniformQuantizedValueConverter() {} + +private: + // An optimized implementation to quantize f32 to i8/u8 with C++ native + // arithmetic. + virtual APInt quantizeF32ToInt8(APFloat expressedValue) const { + assert(&expressedValue.getSemantics() == &APFloat::IEEEsingle()); + assert(storageBitWidth == 8); + assert(roundMode == llvm::APFloatBase::rmNearestTiesToAway); + + const float realValue = expressedValue.convertToFloat(); + + const double scaled = realValue / scaleDouble + zeroPointDouble; + // Round to nearest integer with halfway cases rounded away from zero. 
+ const double scaledRounded = std::round(scaled); + const double clamped = + std::min(std::max(scaledRounded, clampMinDouble), clampMaxDouble); + + uint64_t signlessResult; + if (isSigned) { + int64_t clampedInt = static_cast(clamped); + memcpy(&signlessResult, &clampedInt, sizeof(clampedInt)); + } else { + signlessResult = static_cast(clamped); + } + return APInt(storageBitWidth, signlessResult); + } + + // Keep both APFloat and double versions of the quantization parameters + // around since they will be used in generic and specialized arithmetic, + // respectively. + const APFloat scale; + const APFloat zeroPoint; + const APFloat clampMin; + const APFloat clampMax; + + const double scaleDouble; + const double zeroPointDouble; + const double clampMinDouble; + const double clampMaxDouble; + + const uint32_t storageBitWidth; + const bool isSigned; + const llvm::APFloat::roundingMode roundMode; +}; + +/// A utility class to quantize an attribute by the per-axis quantization +/// parameters. The size of the quantization dim in the converted elements +/// attribute should match the size of scales/zeroPoints vectors in the +/// quantization parameters. +class UniformQuantizedPerAxisValueConverter { +public: + explicit UniformQuantizedPerAxisValueConverter( + UniformQuantizedPerAxisType uniformType) + : scales(uniformType.getScales()), + zeroPoints(uniformType.getZeroPoints()), + clampMin(static_cast(uniformType.getStorageTypeMin())), + clampMax(static_cast(uniformType.getStorageTypeMax())), + storageBitWidth(uniformType.getStorageTypeIntegralWidth()), + isSigned(uniformType.isSigned()), + quantizationDim(uniformType.getQuantizedDimension()) { + assert(uniformType.getExpressedType().isa()); + assert(uniformType.getStorageType().isa()); + assert(scales.size() == zeroPoints.size()); + } + + /// Quantize an Attribute by the quantization parameters. Return nullptr if + /// the conversion fails or the input array isn't an ElementsAttr. 
+ ElementsAttr convert(Attribute realValue); + +private: + /// Quantize an DenseFPElementsAttr by the quantization parameters. + DenseElementsAttr convert(DenseFPElementsAttr attr); + + /// Get a uniform converter for the index-th chunk along the quantizationDim. + /// All the elements in this chunk is quantized by the returned converter. + UniformQuantizedValueConverter getPerChunkConverter(int index) const { + UniformQuantizedValueConverter converter(scales[index], zeroPoints[index], + clampMin, clampMax, + storageBitWidth, isSigned); + return converter; + } + + const ArrayRef scales; + const ArrayRef zeroPoints; + const APFloat clampMin; + const APFloat clampMax; + const uint32_t storageBitWidth; + const bool isSigned; + int32_t quantizationDim; +}; + +} // namespace quant +} // namespace mlir + +#endif // MLIR_DIALECT_QUANTOPS_UNIFORMSUPPORT_H_ diff --git a/mlir/include/mlir/Dialect/SDBM/SDBM.h b/mlir/include/mlir/Dialect/SDBM/SDBM.h new file mode 100644 index 0000000000000000000000000000000000000000..c8a0eec8ca84c88560b87ce22b9d39234d63290b --- /dev/null +++ b/mlir/include/mlir/Dialect/SDBM/SDBM.h @@ -0,0 +1,197 @@ +//===- SDBM.h - MLIR SDBM declaration ---------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A striped difference-bound matrix (SDBM) is a set in Z^N (or R^N) defined +// as {(x_1, ... x_n) | f(x_1, ... x_n) >= 0} where f is an SDBM expression. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SDBM_SDBM_H +#define MLIR_DIALECT_SDBM_SDBM_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +namespace mlir { + +class MLIRContext; +class SDBMDialect; +class SDBMExpr; +class SDBMTermExpr; + +/// A utility class for SDBM to represent an integer with potentially infinite +/// positive value. This uses the largest value of int64_t to represent infinity +/// and redefines the arithmetic operators so that the infinity "saturates": +/// inf + x = inf, +/// inf - x = inf. +/// If a sum of two finite values reaches the largest value of int64_t, the +/// behavior of IntInfty is undefined (in practice, it asserts), similarly to +/// regular signed integer overflow. +class IntInfty { +public: + constexpr static int64_t infty = std::numeric_limits::max(); + + /*implicit*/ IntInfty(int64_t v) : value(v) {} + + IntInfty &operator=(int64_t v) { + value = v; + return *this; + } + + static IntInfty infinity() { return IntInfty(infty); } + + int64_t getValue() const { return value; } + explicit operator int64_t() const { return value; } + + bool isFinite() { return value != infty; } + +private: + int64_t value; +}; + +inline IntInfty operator+(IntInfty lhs, IntInfty rhs) { + if (!lhs.isFinite() || !rhs.isFinite()) + return IntInfty::infty; + + // Check for overflows, treating the sum of two values adding up to INT_MAX as + // overflow. Convert values to unsigned to get an extra bit and avoid the + // undefined behavior of signed integer overflows. 
+ assert((lhs.getValue() <= 0 || rhs.getValue() <= 0 || + static_cast(lhs.getValue()) + + static_cast(rhs.getValue()) < + static_cast(std::numeric_limits::max())) && + "IntInfty overflow"); + // Check for underflows by converting values to unsigned to avoid undefined + // behavior of signed integers perform the addition (bitwise result is same + // because numbers are required to be two's complement in C++) and check if + // the sign bit remains negative. + assert((lhs.getValue() >= 0 || rhs.getValue() >= 0 || + ((static_cast(lhs.getValue()) + + static_cast(rhs.getValue())) >> + 63) == 1) && + "IntInfty underflow"); + + return lhs.getValue() + rhs.getValue(); +} + +inline bool operator<(IntInfty lhs, IntInfty rhs) { + return lhs.getValue() < rhs.getValue(); +} + +inline bool operator<=(IntInfty lhs, IntInfty rhs) { + return lhs.getValue() <= rhs.getValue(); +} + +inline bool operator==(IntInfty lhs, IntInfty rhs) { + return lhs.getValue() == rhs.getValue(); +} + +inline bool operator!=(IntInfty lhs, IntInfty rhs) { return !(lhs == rhs); } + +/// Striped difference-bound matrix is a representation of an integer set bound +/// by a system of SDBMExprs interpreted as inequalities "expr <= 0". +class SDBM { +public: + /// Obtain an SDBM from a list of SDBM expressions treated as inequalities and + /// equalities with zero. + static SDBM get(ArrayRef inequalities, + ArrayRef equalities); + + void getSDBMExpressions(SDBMDialect *dialect, + SmallVectorImpl &inequalities, + SmallVectorImpl &equalities); + + void print(raw_ostream &os); + void dump(); + + IntInfty operator()(int i, int j) { return at(i, j); } + +private: + /// Get the given element of the difference bounds matrix. First index + /// corresponds to the negative term of the difference, second index + /// corresponds to the positive term of the difference. 
+ IntInfty &at(int i, int j) { return matrix[i * getNumVariables() + j]; } + + /// Populate `inequalities` and `equalities` based on the values at(row,col) + /// and at(col,row) of the DBM. Depending on the values being finite and + /// being subsumed by stripe expressions, this may or may not add elements to + /// the lists of equalities and inequalities. + void convertDBMElement(unsigned row, unsigned col, SDBMTermExpr rowExpr, + SDBMTermExpr colExpr, + SmallVectorImpl &inequalities, + SmallVectorImpl &equalities); + + /// Populate `inequalities` based on the value at(pos,pos) of the DBM. Only + /// adds new inequalities if the inequality is not trivially true. + void convertDBMDiagonalElement(unsigned pos, SDBMTermExpr expr, + SmallVectorImpl &inequalities); + + /// Get the total number of elements in the matrix. + unsigned getNumVariables() const { + return 1 + numDims + numSymbols + numTemporaries; + } + + /// Get the position in the matrix that corresponds to the given dimension. + unsigned getDimPosition(unsigned position) const { return 1 + position; } + + /// Get the position in the matrix that corresponds to the given symbol. + unsigned getSymbolPosition(unsigned position) const { + return 1 + numDims + position; + } + + /// Get the position in the matrix that corresponds to the given temporary. + unsigned getTemporaryPosition(unsigned position) const { + return 1 + numDims + numSymbols + position; + } + + /// Number of dimensions in the system, + unsigned numDims; + /// Number of symbols in the system. + unsigned numSymbols; + /// Number of temporary variables in the system. + unsigned numTemporaries; + + /// Difference bounds matrix, stored as a linearized row-major vector. + /// Each value in this matrix corresponds to an inequality + /// + /// v@col - v@row <= at(row, col) + /// + /// where v@col and v@row are the variables that correspond to the linearized + /// position in the matrix. 
The positions correspond to + /// + /// - constant 0 (producing constraints v@col <= X and -v@row <= Y); + /// - SDBM expression dimensions (d0, d1, ...); + /// - SDBM expression symbols (s0, s1, ...); + /// - temporary variables (t0, t1, ...). + /// + /// Temporary variables are introduced to represent expressions that are not + /// trivially a difference between two variables. For example, if one side of + /// a difference expression is itself a stripe expression, it will be replaced + /// with a temporary variable assigned equal to this expression. + /// + /// Infinite entries in the matrix correspond to an absence of a + /// constraint: + /// + /// v@col - v@row <= infinity + /// + /// is trivially true. Negated values at symmetric positions in the matrix + /// allow one to couple two inequalities into a single equality. + std::vector matrix; + + /// The mapping between the indices of variables in the DBM and the stripe + /// expressions they are equal to. These expressions are stored as they + /// appeared when constructing an SDBM from SDBMExprs, in particular no + /// temporaries can appear in these expressions. This removes the need to + /// iteratively substitute definitions of the temporaries in the reverse + /// conversion. + DenseMap stripeToPoint; +}; + +} // namespace mlir + +#endif // MLIR_DIALECT_SDBM_SDBM_H diff --git a/mlir/include/mlir/Dialect/SDBM/SDBMDialect.h b/mlir/include/mlir/Dialect/SDBM/SDBMDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..501c66140f026eb13251946f8f7f294b294d4a09 --- /dev/null +++ b/mlir/include/mlir/Dialect/SDBM/SDBMDialect.h @@ -0,0 +1,32 @@ +//===- SDBMDialect.h - Dialect for striped DBMs -----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SDBM_SDBMDIALECT_H +#define MLIR_DIALECT_SDBM_SDBMDIALECT_H + +#include "mlir/IR/Dialect.h" +#include "mlir/Support/StorageUniquer.h" + +namespace mlir { +class MLIRContext; + +class SDBMDialect : public Dialect { +public: + SDBMDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) {} + + static StringRef getDialectNamespace() { return "sdbm"; } + + /// Get the uniquer for SDBM expressions. This should not be used directly. + StorageUniquer &getUniquer() { return uniquer; } + +private: + StorageUniquer uniquer; +}; +} // namespace mlir + +#endif // MLIR_DIALECT_SDBM_SDBMDIALECT_H diff --git a/mlir/include/mlir/Dialect/SDBM/SDBMExpr.h b/mlir/include/mlir/Dialect/SDBM/SDBMExpr.h new file mode 100644 index 0000000000000000000000000000000000000000..84a9a8405a8394576c1eacd46c07c4dec124b8da --- /dev/null +++ b/mlir/include/mlir/Dialect/SDBM/SDBMExpr.h @@ -0,0 +1,576 @@ +//===- SDBMExpr.h - MLIR SDBM Expression ------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A striped difference-bound matrix (SDBM) expression is a constant expression, +// an identifier, a binary expression with constant RHS and +, stripe operators +// or a difference expression between two identifiers. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SDBM_SDBMEXPR_H +#define MLIR_DIALECT_SDBM_SDBMEXPR_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMapInfo.h" + +namespace mlir { + +class AffineExpr; +class MLIRContext; + +enum class SDBMExprKind { Add, Stripe, Diff, Constant, DimId, SymbolId, Neg }; + +namespace detail { +struct SDBMExprStorage; +struct SDBMBinaryExprStorage; +struct SDBMDiffExprStorage; +struct SDBMTermExprStorage; +struct SDBMConstantExprStorage; +struct SDBMNegExprStorage; +} // namespace detail + +class SDBMConstantExpr; +class SDBMDialect; +class SDBMDimExpr; +class SDBMSymbolExpr; +class SDBMTermExpr; + +/// Striped Difference-Bounded Matrix (SDBM) expression is a base left-hand side +/// expression for the SDBM framework. SDBM expressions are a subset of affine +/// expressions supporting low-complexity algorithms for the operations used in +/// loop transformations. In particular, are supported: +/// - constant expressions; +/// - single variables (dimensions and symbols) with +1 or -1 coefficient; +/// - stripe expressions: "x # C", where "x" is a single variable or another +/// stripe expression, "#" is the stripe operator, and "C" is a constant +/// expression; "#" is defined as x - x mod C. +/// - sum expressions between single variable/stripe expressions and constant +/// expressions; +/// - difference expressions between single variable/stripe expressions. +/// `SDBMExpr` class hierarchy provides a type-safe interface to constructing +/// and operating on SDBM expressions. For example, it requires the LHS of a +/// sum expression to be a single variable or a stripe expression. These +/// restrictions are intended to force the caller to perform the necessary +/// simplifications to stay within the SDBM domain, because SDBM expressions do +/// not combine in more cases than they do. This choice may be reconsidered in +/// the future. 
+/// +/// SDBM expressions are grouped into the following structure +/// - expression +/// - varying +/// - direct +/// - sum <- (term, constant) +/// - term +/// - symbol +/// - dimension +/// - stripe <- (direct, constant) +/// - negation <- (direct) +/// - difference <- (direct, term) +/// - constant +/// The notation <- (...) denotes the types of subexpressions a compound +/// expression can combine. The tree of subexpressions essentially imposes the +/// following canonicalization rules: +/// - constants are always folded; +/// - constants can only appear on the RHS of an expression; +/// - double negation must be elided; +/// - an additive constant term is only allowed in a sum expression, and +/// should be sunk into the nearest such expression in the tree; +/// - zero constant expression can only appear at the top level. +/// +/// `SDBMExpr` and derived classes are thin wrappers around a pointer owned by +/// an MLIRContext, and should be used by-value. They are uniqued in the +/// MLIRContext and immortal. +class SDBMExpr { +public: + using ImplType = detail::SDBMExprStorage; + SDBMExpr() : impl(nullptr) {} + /* implicit */ SDBMExpr(ImplType *expr) : impl(expr) {} + + /// SDBM expressions are thin wrappers around a unique'ed immutable pointer, + /// which makes them trivially assignable and trivially copyable. + SDBMExpr(const SDBMExpr &) = default; + SDBMExpr &operator=(const SDBMExpr &) = default; + + /// SDBM expressions can be compared straight-forwardly. + bool operator==(const SDBMExpr &other) const { return impl == other.impl; } + bool operator!=(const SDBMExpr &other) const { return !(*this == other); } + + /// SDBM expressions are convertible to `bool`: null expressions are converted + /// to false, non-null expressions are converted to true. + explicit operator bool() const { return impl != nullptr; } + bool operator!() const { return !static_cast(*this); } + + /// Negate the given SDBM expression. 
+ SDBMExpr operator-(); + + /// Prints the SDBM expression. + void print(raw_ostream &os) const; + void dump() const; + + /// LLVM-style casts. + template bool isa() const { return U::isClassFor(*this); } + template U dyn_cast() const { + if (!isa()) + return {}; + return U(const_cast(this)->impl); + } + template U cast() const { + assert(isa() && "cast to incorrect subtype"); + return U(const_cast(this)->impl); + } + + /// Support for LLVM hashing. + ::llvm::hash_code hash_value() const { return ::llvm::hash_value(impl); } + + /// Returns the kind of the SDBM expression. + SDBMExprKind getKind() const; + + /// Returns the MLIR context in which this expression lives. + MLIRContext *getContext() const; + + /// Returns the SDBM dialect instance. + SDBMDialect *getDialect() const; + + /// Convert the SDBM expression into an Affine expression. This always + /// succeeds because SDBM are a subset of affine. + AffineExpr getAsAffineExpr() const; + + /// Try constructing an SDBM expression from the given affine expression. + /// This may fail if the affine expression is not representable as SDBM, in + /// which case llvm::None is returned. The conversion procedure recognizes + /// (nested) multiplicative ((x floordiv B) * B) and additive (x - x mod B) + /// patterns for the stripe expression. + static Optional tryConvertAffineExpr(AffineExpr affine); + +protected: + ImplType *impl; +}; + +/// SDBM constant expression, wraps a 64-bit integer. +class SDBMConstantExpr : public SDBMExpr { +public: + using ImplType = detail::SDBMConstantExprStorage; + + using SDBMExpr::SDBMExpr; + + /// Obtain or create a constant expression unique'ed in the given dialect + /// (which belongs to a context). 
+ static SDBMConstantExpr get(SDBMDialect *dialect, int64_t value); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::Constant; + } + + int64_t getValue() const; +}; + +/// SDBM varying expression can be one of: +/// - input variable expression; +/// - stripe expression; +/// - negation (product with -1) of either of the above. +/// - sum of a varying and a constant expression +/// - difference between varying expressions +class SDBMVaryingExpr : public SDBMExpr { +public: + using ImplType = detail::SDBMExprStorage; + using SDBMExpr::SDBMExpr; + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::DimId || + expr.getKind() == SDBMExprKind::SymbolId || + expr.getKind() == SDBMExprKind::Neg || + expr.getKind() == SDBMExprKind::Stripe || + expr.getKind() == SDBMExprKind::Add || + expr.getKind() == SDBMExprKind::Diff; + } +}; + +/// SDBM direct expression includes exactly one variable (symbol or dimension), +/// which is not negated in the expression. It can be one of: +/// - term expression; +/// - sum expression. +class SDBMDirectExpr : public SDBMVaryingExpr { +public: + using SDBMVaryingExpr::SDBMVaryingExpr; + + /// If this is a sum expression, return its variable part, otherwise return + /// self. + SDBMTermExpr getTerm(); + + /// If this is a sum expression, return its constant part, otherwise return 0. + int64_t getConstant(); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::DimId || + expr.getKind() == SDBMExprKind::SymbolId || + expr.getKind() == SDBMExprKind::Stripe || + expr.getKind() == SDBMExprKind::Add; + } +}; + +/// SDBM term expression can be one of: +/// - single variable expression; +/// - stripe expression. +/// Stripe expressions are treated as terms since, in the SDBM domain, they are +/// attached to temporary variables and can appear anywhere a variable can. 
+class SDBMTermExpr : public SDBMDirectExpr { +public: + using SDBMDirectExpr::SDBMDirectExpr; + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::DimId || + expr.getKind() == SDBMExprKind::SymbolId || + expr.getKind() == SDBMExprKind::Stripe; + } +}; + +/// SDBM sum expression. LHS is a term expression and RHS is a constant. +class SDBMSumExpr : public SDBMDirectExpr { +public: + using ImplType = detail::SDBMBinaryExprStorage; + using SDBMDirectExpr::SDBMDirectExpr; + + /// Obtain or create a sum expression unique'ed in the given context. + static SDBMSumExpr get(SDBMTermExpr lhs, SDBMConstantExpr rhs); + + static bool isClassFor(const SDBMExpr &expr) { + SDBMExprKind kind = expr.getKind(); + return kind == SDBMExprKind::Add; + } + + SDBMTermExpr getLHS() const; + SDBMConstantExpr getRHS() const; +}; + +/// SDBM difference expression. LHS is a direct expression, i.e. it may be a +/// sum of a term and a constant. RHS is a term expression. Thus the +/// expression (t1 - t2 + C) with term expressions t1,t2 is represented as +/// diff(sum(t1, C), t2) +/// and it is possible to extract the constant factor without negating it. +class SDBMDiffExpr : public SDBMVaryingExpr { +public: + using ImplType = detail::SDBMDiffExprStorage; + using SDBMVaryingExpr::SDBMVaryingExpr; + + /// Obtain or create a difference expression unique'ed in the given context. + static SDBMDiffExpr get(SDBMDirectExpr lhs, SDBMTermExpr rhs); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::Diff; + } + + SDBMDirectExpr getLHS() const; + SDBMTermExpr getRHS() const; +}; + +/// SDBM stripe expression "x # C" where "x" is a term expression, "C" is a +/// constant expression and "#" is the stripe operator defined as: +/// x # C = x - x mod C. 
+class SDBMStripeExpr : public SDBMTermExpr { +public: + using ImplType = detail::SDBMBinaryExprStorage; + using SDBMTermExpr::SDBMTermExpr; + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::Stripe; + } + + static SDBMStripeExpr get(SDBMDirectExpr var, SDBMConstantExpr stripeFactor); + + SDBMDirectExpr getLHS() const; + SDBMConstantExpr getStripeFactor() const; +}; + +/// SDBM "input" variable expression can be either a dimension identifier or +/// a symbol identifier. When used to define SDBM functions, dimensions are +/// interpreted as function arguments while symbols are treated as unknown but +/// constant values, hence the name. +class SDBMInputExpr : public SDBMTermExpr { +public: + using ImplType = detail::SDBMTermExprStorage; + using SDBMTermExpr::SDBMTermExpr; + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::DimId || + expr.getKind() == SDBMExprKind::SymbolId; + } + + unsigned getPosition() const; +}; + +/// SDBM dimension expression. Dimensions correspond to function arguments +/// when defining functions using SDBM expressions. +class SDBMDimExpr : public SDBMInputExpr { +public: + using ImplType = detail::SDBMTermExprStorage; + using SDBMInputExpr::SDBMInputExpr; + + /// Obtain or create a dimension expression unique'ed in the given dialect + /// (which belongs to a context). + static SDBMDimExpr get(SDBMDialect *dialect, unsigned position); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::DimId; + } +}; + +/// SDBM symbol expression. Symbols correspond to symbolic constants when +/// defining functions using SDBM expressions. +class SDBMSymbolExpr : public SDBMInputExpr { +public: + using ImplType = detail::SDBMTermExprStorage; + using SDBMInputExpr::SDBMInputExpr; + + /// Obtain or create a symbol expression unique'ed in the given dialect (which + /// belongs to a context). 
+ static SDBMSymbolExpr get(SDBMDialect *dialect, unsigned position); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::SymbolId; + } +}; + +/// Negation of an SDBM variable expression. Equivalent to multiplying the +/// expression with -1 (SDBM does not support other coefficients that 1 and -1). +class SDBMNegExpr : public SDBMVaryingExpr { +public: + using ImplType = detail::SDBMNegExprStorage; + using SDBMVaryingExpr::SDBMVaryingExpr; + + /// Obtain or create a negation expression unique'ed in the given context. + static SDBMNegExpr get(SDBMDirectExpr var); + + static bool isClassFor(const SDBMExpr &expr) { + return expr.getKind() == SDBMExprKind::Neg; + } + + SDBMDirectExpr getVar() const; +}; + +/// A visitor class for SDBM expressions. Calls the kind-specific function +/// depending on the kind of expression it visits. +template class SDBMVisitor { +public: + /// Visit the given SDBM expression, dispatching to kind-specific functions. + Result visit(SDBMExpr expr) { + auto *derived = static_cast(this); + switch (expr.getKind()) { + case SDBMExprKind::Add: + case SDBMExprKind::Diff: + case SDBMExprKind::DimId: + case SDBMExprKind::SymbolId: + case SDBMExprKind::Neg: + case SDBMExprKind::Stripe: + return derived->visitVarying(expr.cast()); + case SDBMExprKind::Constant: + return derived->visitConstant(expr.cast()); + } + + llvm_unreachable("unsupported SDBM expression kind"); + } + + /// Traverse the SDBM expression tree calling `visit` on each node + /// in depth-first preorder. + void walkPreorder(SDBMExpr expr) { return walk(expr); } + + /// Traverse the SDBM expression tree calling `visit` on each node in + /// depth-first postorder. + void walkPostorder(SDBMExpr expr) { return walk(expr); } + +protected: + /// Default visitors do nothing. 
+ void visitSum(SDBMSumExpr) {} + void visitDiff(SDBMDiffExpr) {} + void visitStripe(SDBMStripeExpr) {} + void visitDim(SDBMDimExpr) {} + void visitSymbol(SDBMSymbolExpr) {} + void visitNeg(SDBMNegExpr) {} + void visitConstant(SDBMConstantExpr) {} + + /// Default implementation of visitDirect dispatches to the dedicated for sums + /// or delegates to visitTerm for the other expression kinds. Concrete + /// visitors can overload it. + Result visitDirect(SDBMDirectExpr expr) { + auto *derived = static_cast(this); + if (auto sum = expr.dyn_cast()) + return derived->visitSum(sum); + else + return derived->visitTerm(expr.cast()); + } + + /// Default implementation of visitTerm dispatches to the special functions + /// for stripes and other variables. Concrete visitors can override it. + Result visitTerm(SDBMTermExpr expr) { + auto *derived = static_cast(this); + if (expr.getKind() == SDBMExprKind::Stripe) + return derived->visitStripe(expr.cast()); + else + return derived->visitInput(expr.cast()); + } + + /// Default implementation of visitInput dispatches to the special + /// functions for dimensions or symbols. Concrete visitors can override it to + /// visit all variables instead. + Result visitInput(SDBMInputExpr expr) { + auto *derived = static_cast(this); + if (expr.getKind() == SDBMExprKind::DimId) + return derived->visitDim(expr.cast()); + else + return derived->visitSymbol(expr.cast()); + } + + /// Default implementation of visitVarying dispatches to the special + /// functions for variables and negations thereof. Concrete visitors can + /// override it to visit all variables and negations instead. 
+ Result visitVarying(SDBMVaryingExpr expr) { + auto *derived = static_cast(this); + if (auto var = expr.dyn_cast()) + return derived->visitDirect(var); + else if (auto neg = expr.dyn_cast()) + return derived->visitNeg(neg); + else if (auto diff = expr.dyn_cast()) + return derived->visitDiff(diff); + + llvm_unreachable("unhandled subtype of varying SDBM expression"); + } + + template void walk(SDBMExpr expr) { + if (isPreorder) + visit(expr); + if (auto sumExpr = expr.dyn_cast()) { + walk(sumExpr.getLHS()); + walk(sumExpr.getRHS()); + } else if (auto diffExpr = expr.dyn_cast()) { + walk(diffExpr.getLHS()); + walk(diffExpr.getRHS()); + } else if (auto stripeExpr = expr.dyn_cast()) { + walk(stripeExpr.getLHS()); + walk(stripeExpr.getStripeFactor()); + } else if (auto negExpr = expr.dyn_cast()) { + walk(negExpr.getVar()); + } + if (!isPreorder) + visit(expr); + } +}; + +/// Overloaded arithmetic operators for SDBM expressions asserting that their +/// arguments have the proper SDBM expression subtype. Perform canonicalization +/// and constant folding on these expressions. +namespace ops_assertions { + +/// Add two SDBM expressions. At least one of the expressions must be a +/// constant or a negation, but both expressions cannot be negations +/// simultaneously. +SDBMExpr operator+(SDBMExpr lhs, SDBMExpr rhs); +inline SDBMExpr operator+(SDBMExpr lhs, int64_t rhs) { + return lhs + SDBMConstantExpr::get(lhs.getDialect(), rhs); +} +inline SDBMExpr operator+(int64_t lhs, SDBMExpr rhs) { + return SDBMConstantExpr::get(rhs.getDialect(), lhs) + rhs; +} + +/// Subtract an SDBM expression from another SDBM expression. Both expressions +/// must not be difference expressions. 
+SDBMExpr operator-(SDBMExpr lhs, SDBMExpr rhs); +inline SDBMExpr operator-(SDBMExpr lhs, int64_t rhs) { + return lhs - SDBMConstantExpr::get(lhs.getDialect(), rhs); +} +inline SDBMExpr operator-(int64_t lhs, SDBMExpr rhs) { + return SDBMConstantExpr::get(rhs.getDialect(), lhs) - rhs; +} + +/// Construct a stripe expression from a positive expression and a positive +/// constant stripe factor. +SDBMExpr stripe(SDBMExpr expr, SDBMExpr factor); +inline SDBMExpr stripe(SDBMExpr expr, int64_t factor) { + return stripe(expr, SDBMConstantExpr::get(expr.getDialect(), factor)); +} +} // namespace ops_assertions + +} // end namespace mlir + +namespace llvm { +// SDBMExpr hash just like pointers. +template <> struct DenseMapInfo { + static mlir::SDBMExpr getEmptyKey() { + auto *pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::SDBMExpr(static_cast(pointer)); + } + static mlir::SDBMExpr getTombstoneKey() { + auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::SDBMExpr(static_cast(pointer)); + } + static unsigned getHashValue(mlir::SDBMExpr expr) { + return expr.hash_value(); + } + static bool isEqual(mlir::SDBMExpr lhs, mlir::SDBMExpr rhs) { + return lhs == rhs; + } +}; + +// SDBMDirectExpr hash just like pointers. +template <> struct DenseMapInfo { + static mlir::SDBMDirectExpr getEmptyKey() { + auto *pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::SDBMDirectExpr( + static_cast(pointer)); + } + static mlir::SDBMDirectExpr getTombstoneKey() { + auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::SDBMDirectExpr( + static_cast(pointer)); + } + static unsigned getHashValue(mlir::SDBMDirectExpr expr) { + return expr.hash_value(); + } + static bool isEqual(mlir::SDBMDirectExpr lhs, mlir::SDBMDirectExpr rhs) { + return lhs == rhs; + } +}; + +// SDBMTermExpr hash just like pointers. 
+template <> struct DenseMapInfo { + static mlir::SDBMTermExpr getEmptyKey() { + auto *pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::SDBMTermExpr(static_cast(pointer)); + } + static mlir::SDBMTermExpr getTombstoneKey() { + auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::SDBMTermExpr(static_cast(pointer)); + } + static unsigned getHashValue(mlir::SDBMTermExpr expr) { + return expr.hash_value(); + } + static bool isEqual(mlir::SDBMTermExpr lhs, mlir::SDBMTermExpr rhs) { + return lhs == rhs; + } +}; + +// SDBMConstantExpr hash just like pointers. +template <> struct DenseMapInfo { + static mlir::SDBMConstantExpr getEmptyKey() { + auto *pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::SDBMConstantExpr( + static_cast(pointer)); + } + static mlir::SDBMConstantExpr getTombstoneKey() { + auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::SDBMConstantExpr( + static_cast(pointer)); + } + static unsigned getHashValue(mlir::SDBMConstantExpr expr) { + return expr.hash_value(); + } + static bool isEqual(mlir::SDBMConstantExpr lhs, mlir::SDBMConstantExpr rhs) { + return lhs == rhs; + } +}; +} // namespace llvm + +#endif // MLIR_DIALECT_SDBM_SDBMEXPR_H diff --git a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc7180de6cbeca1d97d19ca0e00c0dca0c1607a6 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt @@ -0,0 +1,19 @@ +set(LLVM_TARGET_DEFINITIONS SPIRVLowering.td) +mlir_tablegen(SPIRVLowering.h.inc -gen-struct-attr-decls) +mlir_tablegen(SPIRVLowering.cpp.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRSPIRVLoweringStructGen) + +add_mlir_dialect(SPIRVOps SPIRVOps) + +set(LLVM_TARGET_DEFINITIONS SPIRVBase.td) +mlir_tablegen(SPIRVEnums.h.inc -gen-enum-decls) +mlir_tablegen(SPIRVEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRSPIRVEnumsIncGen) + 
+set(LLVM_TARGET_DEFINITIONS SPIRVOps.td) +mlir_tablegen(SPIRVSerialization.inc -gen-spirv-serialization) +add_public_tablegen_target(MLIRSPIRVSerializationGen) + +set(LLVM_TARGET_DEFINITIONS SPIRVBase.td) +mlir_tablegen(SPIRVOpUtils.inc -gen-spirv-op-utils) +add_public_tablegen_target(MLIRSPIRVOpUtilsGen) diff --git a/mlir/include/mlir/Dialect/SPIRV/LayoutUtils.h b/mlir/include/mlir/Dialect/SPIRV/LayoutUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..329caa2d3aa2cfd6d7f1990a1d3f6fe7a9acc2e7 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/LayoutUtils.h @@ -0,0 +1,71 @@ +//===-- LayoutUtils.h - Decorate composite type with layout information ---===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines utilities used to get alignment and layout information for +// types in SPIR-V dialect. +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_DIALECT_SPIRV_LAYOUTUTILS_H_ +#define MLIR_DIALECT_SPIRV_LAYOUTUTILS_H_ + +#include + +namespace mlir { +class Type; +class VectorType; +namespace spirv { +class StructType; +class ArrayType; +} // namespace spirv + +/// According to the Vulkan spec "14.5.4. Offset and Stride Assignment": +/// "There are different alignment requirements depending on the specific +/// resources and on the features enabled on the device." +/// +/// There are 3 types of alignment: scalar, base, extended. +/// See the spec for details. +/// +/// Note: Even if scalar alignment is supported, it is generally more +/// performant to use the base alignment. So here the calculation is based on +/// base alignment. +/// +/// The memory layout must obey the following rules: +/// 1. 
The Offset decoration of any member must be a multiple of its alignment. +/// 2. Any ArrayStride or MatrixStride decoration must be a multiple of the +/// alignment of the array or matrix as defined above. +/// +/// According to the SPIR-V spec: +/// "The ArrayStride, MatrixStride, and Offset decorations must be large +/// enough to hold the size of the objects they affect (that is, specifying +/// overlap is invalid)." +class VulkanLayoutUtils { +public: + using Size = uint64_t; + + /// Returns a new StructType with layout info. Assigns the type size in bytes + /// to the `size`. Assigns the type alignment in bytes to the `alignment`. + static spirv::StructType decorateType(spirv::StructType structType, + Size &size, Size &alignment); + /// Checks whether a type is legal in terms of Vulkan layout info + /// decoration. A type is dynamically illegal if it's a composite type in the + /// StorageBuffer, PhysicalStorageBuffer, Uniform, and PushConstant Storage + /// Classes without layout information. + static bool isLegalType(Type type); + +private: + static Type decorateType(Type type, Size &size, Size &alignment); + static Type decorateType(VectorType vectorType, Size &size, Size &alignment); + static Type decorateType(spirv::ArrayType arrayType, Size &size, + Size &alignment); + /// Calculates the alignment for the given scalar type. + static Size getScalarTypeAlignment(Type scalarType); +}; + +} // namespace mlir + +#endif // MLIR_DIALECT_SPIRV_LAYOUTUTILS_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/Passes.h b/mlir/include/mlir/Dialect/SPIRV/Passes.h new file mode 100644 index 0000000000000000000000000000000000000000..68f149b54d57dd498f2d3c4660243cb1db8e8d11 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/Passes.h @@ -0,0 +1,40 @@ +//===- Passes.h - SPIR-V pass entry points ----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes that expose pass constructors. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_PASSES_H_ +#define MLIR_DIALECT_SPIRV_PASSES_H_ + +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace spirv { + +class ModuleOp; +/// Creates a module pass that converts composite types used by objects in the +/// StorageBuffer, PhysicalStorageBuffer, Uniform, and PushConstant storage +/// classes with layout information. +/// Right now this pass only supports Vulkan layout rules. +std::unique_ptr> +createDecorateSPIRVCompositeTypeLayoutPass(); + +/// Creates a module pass that lowers the ABI attributes specified during SPIR-V +/// Lowering. Specifically, +/// 1) Creates the global variables for arguments of entry point function using +/// the specification in the ABI attributes for each argument. +/// 2) Inserts the EntryPointOp and the ExecutionModeOp for entry point +/// functions using the specification in the EntryPointAttr. +std::unique_ptr> createLowerABIAttributesPass(); + +} // namespace spirv +} // namespace mlir + +#endif // MLIR_DIALECT_SPIRV_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVArithmeticOps.td new file mode 100644 index 0000000000000000000000000000000000000000..39858f357ff17fc7923851848f78c77d8b6f0d93 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVArithmeticOps.td @@ -0,0 +1,537 @@ +//===-- SPIRVArithmeticOps.td - MLIR SPIR-V Arithmetic Ops -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains arithmetic ops for the SPIR-V dialect. It corresponds
+// to "3.32.13. Arithmetic Instructions" of the SPIR-V specification.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPIRV_ARITHMETIC_OPS
+#define SPIRV_ARITHMETIC_OPS
+
+include "mlir/Dialect/SPIRV/SPIRVBase.td"
+
+class SPV_ArithmeticBinaryOp<string mnemonic, Type type,
+                             list<OpTrait> traits = []> :
+      // Operands type same as result type.
+      SPV_BinaryOp<mnemonic, type, type,
+                   !listconcat(traits,
+                               [NoSideEffect, SameOperandsAndResultType])>;
+
+class SPV_ArithmeticUnaryOp<string mnemonic, Type type,
+                            list<OpTrait> traits = []> :
+      // Operand type same as result type.
+      SPV_UnaryOp<mnemonic, type, type,
+                  !listconcat(traits,
+                              [NoSideEffect, SameOperandsAndResultType])>;
+
+// -----
+
+def SPV_FAddOp : SPV_ArithmeticBinaryOp<"FAdd", SPV_Float, [Commutative]> {
+  let summary = "Floating-point addition of Operand 1 and Operand 2.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component.
+
+    ### Custom assembly form
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fadd-op ::= ssa-id `=` `spv.FAdd` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+    For example:
+
+    ```
+    %4 = spv.FAdd %0, %1 : f32
+    %5 = spv.FAdd %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FDivOp : SPV_ArithmeticBinaryOp<"FDiv", SPV_Float, []> {
+  let summary = "Floating-point division of Operand 1 divided by Operand 2.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component. The resulting value is undefined
+    if Operand 2 is 0.
+
+    ### Custom assembly form
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fdiv-op ::= ssa-id `=` `spv.FDiv` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+
+    For example:
+
+    ```
+    %4 = spv.FDiv %0, %1 : f32
+    %5 = spv.FDiv %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FModOp : SPV_ArithmeticBinaryOp<"FMod", SPV_Float, []> {
+  let summary = [{
+    The floating-point remainder whose sign matches the sign of Operand 2.
+  }];
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component. The resulting value is undefined
+    if Operand 2 is 0. Otherwise, the result is the remainder r of Operand
+    1 divided by Operand 2 where if r ≠ 0, the sign of r is the same as the
+    sign of Operand 2.
+
+    ### Custom assembly form
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fmod-op ::= ssa-id `=` `spv.FMod` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+    For example:
+
+    ```
+    %4 = spv.FMod %0, %1 : f32
+    %5 = spv.FMod %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FMulOp : SPV_ArithmeticBinaryOp<"FMul", SPV_Float, [Commutative]> {
+  let summary = "Floating-point multiplication of Operand 1 and Operand 2.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component.
+
+    ### Custom assembly form
+
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fmul-op ::= `spv.FMul` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+
+    For example:
+
+    ```
+    %4 = spv.FMul %0, %1 : f32
+    %5 = spv.FMul %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FNegateOp : SPV_ArithmeticUnaryOp<"FNegate", SPV_Float, []> {
+  let summary = "Floating-point subtract of Operand from zero.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The type of Operand must be the same as Result Type.
+
+    Results are computed per component.
+
+    ### Custom assembly form
+
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fmul-op ::= `spv.FNegate` ssa-use `:` float-scalar-vector-type
+    ```
+
+    For example:
+
+    ```
+    %1 = spv.FNegate %0 : f32
+    %3 = spv.FNegate %2 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FRemOp : SPV_ArithmeticBinaryOp<"FRem", SPV_Float, []> {
+  let summary = [{
+    The floating-point remainder whose sign matches the sign of Operand 1.
+  }];
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component. The resulting value is undefined
+    if Operand 2 is 0. Otherwise, the result is the remainder r of Operand
+    1 divided by Operand 2 where if r ≠ 0, the sign of r is the same as the
+    sign of Operand 1.
+
+    ### Custom assembly form
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    frem-op ::= ssa-id `=` `spv.FRem` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+
+    For example:
+
+    ```
+    %4 = spv.FRem %0, %1 : f32
+    %5 = spv.FRem %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_FSubOp : SPV_ArithmeticBinaryOp<"FSub", SPV_Float, []> {
+  let summary = "Floating-point subtraction of Operand 2 from Operand 1.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type.
+
+    The types of Operand 1 and Operand 2 both must be the same as Result
+    Type.
+
+    Results are computed per component.
+
+    ### Custom assembly form
+    ```
+    float-scalar-vector-type ::= float-type |
+                                 `vector<` integer-literal `x` float-type `>`
+    fsub-op ::= ssa-id `=` `spv.FSub` ssa-use, ssa-use
+                          `:` float-scalar-vector-type
+    ```
+
+    For example:
+
+    ```
+    %4 = spv.FSub %0, %1 : f32
+    %5 = spv.FSub %2, %3 : vector<4xf32>
+    ```
+  }];
+}
+
+// -----
+
+def SPV_IAddOp : SPV_ArithmeticBinaryOp<"IAdd", SPV_Integer, [Commutative]> {
+  let summary = "Integer addition of Operand 1 and Operand 2.";
+
+  let description = [{
+    Result Type must be a scalar or vector of integer type.
+
+    The type of Operand 1 and Operand 2 must be a scalar or vector of
+    integer type. They must have the same number of components as Result
+    Type. They must have the same component width as Result Type.
+
+    The resulting value will equal the low-order N bits of the correct
+    result R, where N is the component width and R is computed with enough
+    precision to avoid overflow and underflow.
+
+    Results are computed per component.
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + iadd-op ::= ssa-id `=` `spv.IAdd` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %4 = spv.IAdd %0, %1 : i32 + %5 = spv.IAdd %2, %3 : vector<4xi32> + + ``` + }]; + + let hasFolder = 1; +} + +// ----- + +def SPV_IMulOp : SPV_ArithmeticBinaryOp<"IMul", SPV_Integer, [Commutative]> { + let summary = "Integer multiplication of Operand 1 and Operand 2."; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same number of components as Result + Type. They must have the same component width as Result Type. + + The resulting value will equal the low-order N bits of the correct + result R, where N is the component width and R is computed with enough + precision to avoid overflow and underflow. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + imul-op ::= ssa-id `=` `spv.IMul` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %4 = spv.IMul %0, %1 : i32 + %5 = spv.IMul %2, %3 : vector<4xi32> + + ``` + }]; + + let hasFolder = 1; +} + +// ----- + +def SPV_ISubOp : SPV_ArithmeticBinaryOp<"ISub", SPV_Integer, []> { + let summary = "Integer subtraction of Operand 2 from Operand 1."; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same number of components as Result + Type. They must have the same component width as Result Type. 
+ + The resulting value will equal the low-order N bits of the correct + result R, where N is the component width and R is computed with enough + precision to avoid overflow and underflow. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + isub-op ::= `spv.ISub` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %4 = spv.ISub %0, %1 : i32 + %5 = spv.ISub %2, %3 : vector<4xi32> + + ``` + }]; + + let hasFolder = 1; +} + +// ----- + +def SPV_SDivOp : SPV_ArithmeticBinaryOp<"SDiv", SPV_Integer, []> { + let summary = "Signed-integer division of Operand 1 divided by Operand 2."; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same number of components as Result + Type. They must have the same component width as Result Type. + + Results are computed per component. The resulting value is undefined + if Operand 2 is 0. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sdiv-op ::= ssa-id `=` `spv.SDiv` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %4 = spv.SDiv %0, %1 : i32 + %5 = spv.SDiv %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SModOp : SPV_ArithmeticBinaryOp<"SMod", SPV_Integer, []> { + let summary = [{ + Signed remainder operation for the remainder whose sign matches the sign + of Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same number of components as Result + Type. They must have the same component width as Result Type. + + Results are computed per component. 
The resulting value is undefined + if Operand 2 is 0. Otherwise, the result is the remainder r of Operand + 1 divided by Operand 2 where if r ≠ 0, the sign of r is the same as the + sign of Operand 2. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + smod-op ::= ssa-id `=` `spv.SMod` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SMod %0, %1 : i32 + %5 = spv.SMod %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SRemOp : SPV_ArithmeticBinaryOp<"SRem", SPV_Integer, []> { + let summary = [{ + Signed remainder operation for the remainder whose sign matches the sign + of Operand 1. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same number of components as Result + Type. They must have the same component width as Result Type. + + Results are computed per component. The resulting value is undefined + if Operand 2 is 0. Otherwise, the result is the remainder r of Operand + 1 divided by Operand 2 where if r ≠ 0, the sign of r is the same as the + sign of Operand 1. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + srem-op ::= ssa-id `=` `spv.SRem` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SRem %0, %1 : i32 + %5 = spv.SRem %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_UDivOp : SPV_ArithmeticBinaryOp<"UDiv", SPV_Integer, []> { + let summary = "Unsigned-integer division of Operand 1 divided by Operand 2."; + + let description = [{ + Result Type must be a scalar or vector of integer type, whose Signedness + operand is 0. + + The types of Operand 1 and Operand 2 both must be the same as Result + Type. + + Results are computed per component. 
The resulting value is undefined + if Operand 2 is 0. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + udiv-op ::= ssa-id `=` `spv.UDiv` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.UDiv %0, %1 : i32 + %5 = spv.UDiv %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_UModOp : SPV_ArithmeticBinaryOp<"UMod", SPV_Integer> { + let summary = "Unsigned modulo operation of Operand 1 modulo Operand 2."; + + let description = [{ + Result Type must be a scalar or vector of integer type, whose Signedness + operand is 0. + + The types of Operand 1 and Operand 2 both must be the same as Result + Type. + + Results are computed per component. The resulting value is undefined + if Operand 2 is 0. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + umod-op ::= ssa-id `=` `spv.UMod` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.UMod %0, %1 : i32 + %5 = spv.UMod %2, %3 : vector<4xi32> + + ``` + }]; +} + +#endif // SPIRV_ARITHMETIC_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td new file mode 100644 index 0000000000000000000000000000000000000000..c2ea100c12162798535887f5198ac69b929bd8e8 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td @@ -0,0 +1,552 @@ +//===-- SPIRVAtomicOps.td - MLIR SPIR-V Atomic Ops ---------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains atomic ops for the SPIR-V dialect. It corresponds to +// "3.32.18. 
Atomic Instructions" of the SPIR-V specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_ATOMIC_OPS +#define SPIRV_ATOMIC_OPS + +class SPV_AtomicUpdateOp traits = []> : + SPV_Op { + let parser = [{ return ::parseAtomicUpdateOp(parser, result, false); }]; + let printer = [{ return ::printAtomicUpdateOp(getOperation(), p); }]; + let verifier = [{ return ::verifyAtomicUpdateOp(getOperation()); }]; + + let arguments = (ins + SPV_AnyPtr:$pointer, + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$semantics + ); + let results = (outs + SPV_Integer:$result + ); +} + +class SPV_AtomicUpdateWithValueOp traits = []> : + SPV_Op { + let parser = [{ return ::parseAtomicUpdateOp(parser, result, true); }]; + let printer = [{ return ::printAtomicUpdateOp(getOperation(), p); }]; + let verifier = [{ return ::verifyAtomicUpdateOp(getOperation()); }]; + + let arguments = (ins + SPV_AnyPtr:$pointer, + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$semantics, + SPV_Integer:$value + ); + let results = (outs + SPV_Integer:$result + ); +} + +// ----- + +def SPV_AtomicAndOp : SPV_AtomicUpdateWithValueOp<"AtomicAnd", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by the bitwise AND of Original Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + scope ::= `"CrossDevice"` | `"Device"` | `"Workgroup"` | ... + + memory-semantics ::= `"None"` | `"Acquire"` | "Release"` | ... 
+ + atomic-and-op ::= + `spv.AtomicAnd` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicAnd "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicCompareExchangeWeakOp : SPV_Op<"AtomicCompareExchangeWeak", []> { + let summary = "Deprecated (use OpAtomicCompareExchange)."; + + let description = [{ + Has the same semantics as OpAtomicCompareExchange. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-compare-exchange-weak-op ::= + `spv.AtomicCompareExchangeWeak` scope memory-semantics memory-semantics + ssa-use `,` ssa-use `,` ssa-use + `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicCompareExchangeWeak "Workgroup" "Acquire" "None" + %pointer, %value, %comparator + : !spv.ptr + ``` + }]; + + let arguments = (ins + SPV_AnyPtr:$pointer, + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$equal_semantics, + SPV_MemorySemanticsAttr:$unequal_semantics, + SPV_Integer:$value, + SPV_Integer:$comparator + ); + + let results = (outs + SPV_Integer:$result + ); +} + +// ----- + +def SPV_AtomicIAddOp : SPV_AtomicUpdateWithValueOp<"AtomicIAdd", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by integer addition of Original Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. 
+ + ### Custom assembly form + + ``` + atomic-iadd-op ::= + `spv.AtomicIAdd` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicIAdd "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicIDecrementOp : SPV_AtomicUpdateOp<"AtomicIDecrement", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value through integer subtraction of 1 from Original Value, + and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. The type of the value + pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-idecrement-op ::= + `spv.AtomicIDecrement` scope memory-semantics ssa-use + `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicIDecrement "Device" "None" %pointer : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicIIncrementOp : SPV_AtomicUpdateOp<"AtomicIIncrement", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value through integer addition of 1 to Original Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. The type of the value + pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. 
+ + ### Custom assembly form + + ``` + atomic-iincrement-op ::= + `spv.AtomicIIncrement` scope memory-semantics ssa-use + `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicIIncrement "Device" "None" %pointer : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicISubOp : SPV_AtomicUpdateWithValueOp<"AtomicISub", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by integer subtraction of Value from Original Value, + and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-isub-op ::= + `spv.AtomicISub` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicISub "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicOrOp : SPV_AtomicUpdateWithValueOp<"AtomicOr", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by the bitwise OR of Original Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. 
+ + ### Custom assembly form + + ``` + atomic-or-op ::= + `spv.AtomicOr` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicOr "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicSMaxOp : SPV_AtomicUpdateWithValueOp<"AtomicSMax", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by finding the largest signed integer of Original + Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-smax-op ::= + `spv.AtomicSMax` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicSMax "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicSMinOp : SPV_AtomicUpdateWithValueOp<"AtomicSMin", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by finding the smallest signed integer of Original + Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. 
+ + ### Custom assembly form + + ``` + atomic-smin-op ::= + `spv.AtomicSMin` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicSMin "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicUMaxOp : SPV_AtomicUpdateWithValueOp<"AtomicUMax", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by finding the largest unsigned integer of Original + Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-umax-op ::= + `spv.AtomicUMax` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicUMax "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicUMinOp : SPV_AtomicUpdateWithValueOp<"AtomicUMin", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by finding the smallest unsigned integer of Original + Value and Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. 
+ + ### Custom assembly form + + ``` + atomic-umin-op ::= + `spv.AtomicUMin` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicUMin "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +def SPV_AtomicXorOp : SPV_AtomicUpdateWithValueOp<"AtomicXor", []> { + let summary = [{ + Perform the following steps atomically with respect to any other atomic + accesses within Scope to the same location: + }]; + + let description = [{ + 1) load through Pointer to get an Original Value, + + 2) get a New Value by the bitwise exclusive OR of Original Value and + Value, and + + 3) store the New Value back through Pointer. + + The instruction’s result is the Original Value. + + Result Type must be an integer type scalar. + + The type of Value must be the same as Result Type. The type of the + value pointed to by Pointer must be the same as Result Type. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` + atomic-xor-op ::= + `spv.AtomicXor` scope memory-semantics + ssa-use `,` ssa-use `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicXor "Device" "None" %pointer, %value : + !spv.ptr + ``` + }]; +} + +// ----- + +#endif // SPIRV_ATOMIC_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td new file mode 100644 index 0000000000000000000000000000000000000000..5751a32e1695ca3278f94267b4f2fc0dbd13190b --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -0,0 +1,1319 @@ +//===- SPIRVBase.td - MLIR SPIR-V Op Definitions Base file -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the base file for SPIR-V operation definition specification. +// This file defines the SPIR-V dialect, common SPIR-V types, and utilities +// for facilitating defining SPIR-V ops. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_BASE +#define SPIRV_BASE + +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// SPIR-V dialect definitions +//===----------------------------------------------------------------------===// + +def SPV_Dialect : Dialect { + let name = "spv"; + + let description = [{ + The SPIR-V dialect in MLIR. + + SPIR-V is the Khronos Group's binary intermediate language for representing + graphical-shader stages and compute kernels for multiple Khronos APIs, + including OpenCL, OpenGL, and Vulkan. + See https://www.khronos.org/registry/spir-v for more details. + + This dialect aims to be a simple proxy for the SPIR-V binary format to + enable straightforward and lightweight conversion from/to the binary + format. Ops in this dialect should stay at the same semantic level and + try to be a mechanical mapping to the corresponding SPIR-V instructions; + but they may deviate representationally to allow using MLIR mechanisms. + As a convention, if such deviation happens, the op name follows "snake_case" + style; otherwise, the op name just follows the SPIR-V mnemonic (by removing + the leading `Op` prefix) to use "CamelCase" style. + }]; + + let cppNamespace = "spirv"; +} + +//===----------------------------------------------------------------------===// +// SPIR-V extension definitions +//===----------------------------------------------------------------------===// + +// Extensions known to the SPIR-V dialect. +// https://github.com/KhronosGroup/SPIRV-Registry has the full list. 
+def SPV_KHR_16bit_storage : StrEnumAttrCase<"SPV_KHR_16bit_storage">; +def SPV_KHR_8bit_storage : StrEnumAttrCase<"SPV_KHR_8bit_storage">; +def SPV_KHR_device_group : StrEnumAttrCase<"SPV_KHR_device_group">; +def SPV_KHR_float_controls : StrEnumAttrCase<"SPV_KHR_float_controls">; +def SPV_KHR_physical_storage_buffer : StrEnumAttrCase<"SPV_KHR_physical_storage_buffer">; +def SPV_KHR_multiview : StrEnumAttrCase<"SPV_KHR_multiview">; +def SPV_KHR_no_integer_wrap_decoration : StrEnumAttrCase<"SPV_KHR_no_integer_wrap_decoration">; +def SPV_KHR_post_depth_coverage : StrEnumAttrCase<"SPV_KHR_post_depth_coverage">; +def SPV_KHR_shader_atomic_counter_ops : StrEnumAttrCase<"SPV_KHR_shader_atomic_counter_ops">; +def SPV_KHR_shader_ballot : StrEnumAttrCase<"SPV_KHR_shader_ballot">; +def SPV_KHR_shader_draw_parameters : StrEnumAttrCase<"SPV_KHR_shader_draw_parameters">; +def SPV_KHR_storage_buffer_storage_class : StrEnumAttrCase<"SPV_KHR_storage_buffer_storage_class">; +def SPV_KHR_subgroup_vote : StrEnumAttrCase<"SPV_KHR_subgroup_vote">; +def SPV_KHR_variable_pointers : StrEnumAttrCase<"SPV_KHR_variable_pointers">; +def SPV_KHR_vulkan_memory_model : StrEnumAttrCase<"SPV_KHR_vulkan_memory_model">; + +def SPV_EXT_fragment_fully_covered : StrEnumAttrCase<"SPV_EXT_fragment_fully_covered">; +def SPV_EXT_fragment_invocation_density : StrEnumAttrCase<"SPV_EXT_fragment_invocation_density">; +def SPV_EXT_fragment_shader_interlock : StrEnumAttrCase<"SPV_EXT_fragment_shader_interlock">; +def SPV_EXT_physical_storage_buffer : StrEnumAttrCase<"SPV_EXT_physical_storage_buffer">; +def SPV_EXT_shader_stencil_export : StrEnumAttrCase<"SPV_EXT_shader_stencil_export">; + +def SPV_AMD_shader_explicit_vertex_parameter : StrEnumAttrCase<"SPV_AMD_shader_explicit_vertex_parameter">; + +def SPV_GOOGLE_user_type : StrEnumAttrCase<"SPV_GOOGLE_user_type">; + +def SPV_NV_compute_shader_derivatives : StrEnumAttrCase<"SPV_NV_compute_shader_derivatives">; +def SPV_NV_fragment_shader_barycentric : 
StrEnumAttrCase<"SPV_NV_fragment_shader_barycentric">; +def SPV_NV_geometry_shader_passthrough : StrEnumAttrCase<"SPV_NV_geometry_shader_passthrough">; +def SPV_NV_mesh_shader : StrEnumAttrCase<"SPV_NV_mesh_shader">; +def SPV_NV_ray_tracing : StrEnumAttrCase<"SPV_NV_ray_tracing">; +def SPV_NV_sample_mask_override_coverage : StrEnumAttrCase<"SPV_NV_sample_mask_override_coverage">; +def SPV_NV_shader_sm_builtins : StrEnumAttrCase<"SPV_NV_shader_sm_builtins">; +def SPV_NV_shading_rate : StrEnumAttrCase<"SPV_NV_shading_rate">; +def SPV_NV_stereo_view_rendering : StrEnumAttrCase<"SPV_NV_stereo_view_rendering">; +def SPV_NV_viewport_array2 : StrEnumAttrCase<"SPV_NV_viewport_array2">; + +def SPV_NVX_multiview_per_view_attributes : StrEnumAttrCase<"SPV_NVX_multiview_per_view_attributes">; + +def SPV_ExtensionAttr : + StrEnumAttr<"Extension", "supported SPIR-V extensions", [ + SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_device_group, + SPV_KHR_float_controls, SPV_KHR_physical_storage_buffer, SPV_KHR_multiview, + SPV_KHR_no_integer_wrap_decoration, SPV_KHR_post_depth_coverage, + SPV_KHR_shader_atomic_counter_ops, SPV_KHR_shader_ballot, + SPV_KHR_shader_draw_parameters, SPV_KHR_storage_buffer_storage_class, + SPV_KHR_subgroup_vote, SPV_KHR_variable_pointers, + SPV_KHR_vulkan_memory_model, SPV_EXT_fragment_fully_covered, + SPV_EXT_fragment_invocation_density, SPV_EXT_fragment_shader_interlock, + SPV_EXT_physical_storage_buffer, SPV_EXT_shader_stencil_export, + SPV_AMD_shader_explicit_vertex_parameter, SPV_GOOGLE_user_type, + SPV_NV_compute_shader_derivatives, SPV_NV_fragment_shader_barycentric, + SPV_NV_geometry_shader_passthrough, SPV_NV_mesh_shader, SPV_NV_ray_tracing, + SPV_NV_sample_mask_override_coverage, SPV_NV_shader_sm_builtins, + SPV_NV_shading_rate, SPV_NV_stereo_view_rendering, + SPV_NV_viewport_array2, SPV_NVX_multiview_per_view_attributes, + ]> { + let cppNamespace = "::mlir::spirv"; +} + 
+//===----------------------------------------------------------------------===// +// SPIR-V enum definitions +//===----------------------------------------------------------------------===// + +// Begin enum section. Generated from SPIR-V spec; DO NOT MODIFY! + +def SPV_C_Matrix : I32EnumAttrCase<"Matrix", 0>; +def SPV_C_Shader : I32EnumAttrCase<"Shader", 1>; +def SPV_C_Geometry : I32EnumAttrCase<"Geometry", 2>; +def SPV_C_Tessellation : I32EnumAttrCase<"Tessellation", 3>; +def SPV_C_Addresses : I32EnumAttrCase<"Addresses", 4>; +def SPV_C_Linkage : I32EnumAttrCase<"Linkage", 5>; +def SPV_C_Kernel : I32EnumAttrCase<"Kernel", 6>; +def SPV_C_Vector16 : I32EnumAttrCase<"Vector16", 7>; +def SPV_C_Float16Buffer : I32EnumAttrCase<"Float16Buffer", 8>; +def SPV_C_Float16 : I32EnumAttrCase<"Float16", 9>; +def SPV_C_Float64 : I32EnumAttrCase<"Float64", 10>; +def SPV_C_Int64 : I32EnumAttrCase<"Int64", 11>; +def SPV_C_Int64Atomics : I32EnumAttrCase<"Int64Atomics", 12>; +def SPV_C_ImageBasic : I32EnumAttrCase<"ImageBasic", 13>; +def SPV_C_ImageReadWrite : I32EnumAttrCase<"ImageReadWrite", 14>; +def SPV_C_ImageMipmap : I32EnumAttrCase<"ImageMipmap", 15>; +def SPV_C_Pipes : I32EnumAttrCase<"Pipes", 17>; +def SPV_C_Groups : I32EnumAttrCase<"Groups", 18>; +def SPV_C_DeviceEnqueue : I32EnumAttrCase<"DeviceEnqueue", 19>; +def SPV_C_LiteralSampler : I32EnumAttrCase<"LiteralSampler", 20>; +def SPV_C_AtomicStorage : I32EnumAttrCase<"AtomicStorage", 21>; +def SPV_C_Int16 : I32EnumAttrCase<"Int16", 22>; +def SPV_C_TessellationPointSize : I32EnumAttrCase<"TessellationPointSize", 23>; +def SPV_C_GeometryPointSize : I32EnumAttrCase<"GeometryPointSize", 24>; +def SPV_C_ImageGatherExtended : I32EnumAttrCase<"ImageGatherExtended", 25>; +def SPV_C_StorageImageMultisample : I32EnumAttrCase<"StorageImageMultisample", 27>; +def SPV_C_UniformBufferArrayDynamicIndexing : I32EnumAttrCase<"UniformBufferArrayDynamicIndexing", 28>; +def SPV_C_SampledImageArrayDynamicIndexing : 
I32EnumAttrCase<"SampledImageArrayDynamicIndexing", 29>; +def SPV_C_StorageBufferArrayDynamicIndexing : I32EnumAttrCase<"StorageBufferArrayDynamicIndexing", 30>; +def SPV_C_StorageImageArrayDynamicIndexing : I32EnumAttrCase<"StorageImageArrayDynamicIndexing", 31>; +def SPV_C_ClipDistance : I32EnumAttrCase<"ClipDistance", 32>; +def SPV_C_CullDistance : I32EnumAttrCase<"CullDistance", 33>; +def SPV_C_ImageCubeArray : I32EnumAttrCase<"ImageCubeArray", 34>; +def SPV_C_SampleRateShading : I32EnumAttrCase<"SampleRateShading", 35>; +def SPV_C_ImageRect : I32EnumAttrCase<"ImageRect", 36>; +def SPV_C_SampledRect : I32EnumAttrCase<"SampledRect", 37>; +def SPV_C_GenericPointer : I32EnumAttrCase<"GenericPointer", 38>; +def SPV_C_Int8 : I32EnumAttrCase<"Int8", 39>; +def SPV_C_InputAttachment : I32EnumAttrCase<"InputAttachment", 40>; +def SPV_C_SparseResidency : I32EnumAttrCase<"SparseResidency", 41>; +def SPV_C_MinLod : I32EnumAttrCase<"MinLod", 42>; +def SPV_C_Sampled1D : I32EnumAttrCase<"Sampled1D", 43>; +def SPV_C_Image1D : I32EnumAttrCase<"Image1D", 44>; +def SPV_C_SampledCubeArray : I32EnumAttrCase<"SampledCubeArray", 45>; +def SPV_C_SampledBuffer : I32EnumAttrCase<"SampledBuffer", 46>; +def SPV_C_ImageBuffer : I32EnumAttrCase<"ImageBuffer", 47>; +def SPV_C_ImageMSArray : I32EnumAttrCase<"ImageMSArray", 48>; +def SPV_C_StorageImageExtendedFormats : I32EnumAttrCase<"StorageImageExtendedFormats", 49>; +def SPV_C_ImageQuery : I32EnumAttrCase<"ImageQuery", 50>; +def SPV_C_DerivativeControl : I32EnumAttrCase<"DerivativeControl", 51>; +def SPV_C_InterpolationFunction : I32EnumAttrCase<"InterpolationFunction", 52>; +def SPV_C_TransformFeedback : I32EnumAttrCase<"TransformFeedback", 53>; +def SPV_C_GeometryStreams : I32EnumAttrCase<"GeometryStreams", 54>; +def SPV_C_StorageImageReadWithoutFormat : I32EnumAttrCase<"StorageImageReadWithoutFormat", 55>; +def SPV_C_StorageImageWriteWithoutFormat : I32EnumAttrCase<"StorageImageWriteWithoutFormat", 56>; +def SPV_C_MultiViewport : 
I32EnumAttrCase<"MultiViewport", 57>; +def SPV_C_SubgroupDispatch : I32EnumAttrCase<"SubgroupDispatch", 58>; +def SPV_C_NamedBarrier : I32EnumAttrCase<"NamedBarrier", 59>; +def SPV_C_PipeStorage : I32EnumAttrCase<"PipeStorage", 60>; +def SPV_C_GroupNonUniform : I32EnumAttrCase<"GroupNonUniform", 61>; +def SPV_C_GroupNonUniformVote : I32EnumAttrCase<"GroupNonUniformVote", 62>; +def SPV_C_GroupNonUniformArithmetic : I32EnumAttrCase<"GroupNonUniformArithmetic", 63>; +def SPV_C_GroupNonUniformBallot : I32EnumAttrCase<"GroupNonUniformBallot", 64>; +def SPV_C_GroupNonUniformShuffle : I32EnumAttrCase<"GroupNonUniformShuffle", 65>; +def SPV_C_GroupNonUniformShuffleRelative : I32EnumAttrCase<"GroupNonUniformShuffleRelative", 66>; +def SPV_C_GroupNonUniformClustered : I32EnumAttrCase<"GroupNonUniformClustered", 67>; +def SPV_C_GroupNonUniformQuad : I32EnumAttrCase<"GroupNonUniformQuad", 68>; +def SPV_C_ShaderLayer : I32EnumAttrCase<"ShaderLayer", 69>; +def SPV_C_ShaderViewportIndex : I32EnumAttrCase<"ShaderViewportIndex", 70>; +def SPV_C_SubgroupBallotKHR : I32EnumAttrCase<"SubgroupBallotKHR", 4423>; +def SPV_C_DrawParameters : I32EnumAttrCase<"DrawParameters", 4427>; +def SPV_C_SubgroupVoteKHR : I32EnumAttrCase<"SubgroupVoteKHR", 4431>; +def SPV_C_StorageBuffer16BitAccess : I32EnumAttrCase<"StorageBuffer16BitAccess", 4433>; +def SPV_C_StorageUniform16 : I32EnumAttrCase<"StorageUniform16", 4434>; +def SPV_C_StoragePushConstant16 : I32EnumAttrCase<"StoragePushConstant16", 4435>; +def SPV_C_StorageInputOutput16 : I32EnumAttrCase<"StorageInputOutput16", 4436>; +def SPV_C_DeviceGroup : I32EnumAttrCase<"DeviceGroup", 4437>; +def SPV_C_MultiView : I32EnumAttrCase<"MultiView", 4439>; +def SPV_C_VariablePointersStorageBuffer : I32EnumAttrCase<"VariablePointersStorageBuffer", 4441>; +def SPV_C_VariablePointers : I32EnumAttrCase<"VariablePointers", 4442>; +def SPV_C_AtomicStorageOps : I32EnumAttrCase<"AtomicStorageOps", 4445>; +def SPV_C_SampleMaskPostDepthCoverage : 
I32EnumAttrCase<"SampleMaskPostDepthCoverage", 4447>; +def SPV_C_StorageBuffer8BitAccess : I32EnumAttrCase<"StorageBuffer8BitAccess", 4448>; +def SPV_C_UniformAndStorageBuffer8BitAccess : I32EnumAttrCase<"UniformAndStorageBuffer8BitAccess", 4449>; +def SPV_C_StoragePushConstant8 : I32EnumAttrCase<"StoragePushConstant8", 4450>; +def SPV_C_DenormPreserve : I32EnumAttrCase<"DenormPreserve", 4464>; +def SPV_C_DenormFlushToZero : I32EnumAttrCase<"DenormFlushToZero", 4465>; +def SPV_C_SignedZeroInfNanPreserve : I32EnumAttrCase<"SignedZeroInfNanPreserve", 4466>; +def SPV_C_RoundingModeRTE : I32EnumAttrCase<"RoundingModeRTE", 4467>; +def SPV_C_RoundingModeRTZ : I32EnumAttrCase<"RoundingModeRTZ", 4468>; +def SPV_C_Float16ImageAMD : I32EnumAttrCase<"Float16ImageAMD", 5008>; +def SPV_C_ImageGatherBiasLodAMD : I32EnumAttrCase<"ImageGatherBiasLodAMD", 5009>; +def SPV_C_FragmentMaskAMD : I32EnumAttrCase<"FragmentMaskAMD", 5010>; +def SPV_C_StencilExportEXT : I32EnumAttrCase<"StencilExportEXT", 5013>; +def SPV_C_ImageReadWriteLodAMD : I32EnumAttrCase<"ImageReadWriteLodAMD", 5015>; +def SPV_C_ShaderClockKHR : I32EnumAttrCase<"ShaderClockKHR", 5055>; +def SPV_C_SampleMaskOverrideCoverageNV : I32EnumAttrCase<"SampleMaskOverrideCoverageNV", 5249>; +def SPV_C_GeometryShaderPassthroughNV : I32EnumAttrCase<"GeometryShaderPassthroughNV", 5251>; +def SPV_C_ShaderViewportIndexLayerEXT : I32EnumAttrCase<"ShaderViewportIndexLayerEXT", 5254>; +def SPV_C_ShaderViewportMaskNV : I32EnumAttrCase<"ShaderViewportMaskNV", 5255>; +def SPV_C_ShaderStereoViewNV : I32EnumAttrCase<"ShaderStereoViewNV", 5259>; +def SPV_C_PerViewAttributesNV : I32EnumAttrCase<"PerViewAttributesNV", 5260>; +def SPV_C_FragmentFullyCoveredEXT : I32EnumAttrCase<"FragmentFullyCoveredEXT", 5265>; +def SPV_C_MeshShadingNV : I32EnumAttrCase<"MeshShadingNV", 5266>; +def SPV_C_ImageFootprintNV : I32EnumAttrCase<"ImageFootprintNV", 5282>; +def SPV_C_FragmentBarycentricNV : I32EnumAttrCase<"FragmentBarycentricNV", 5284>; +def 
SPV_C_ComputeDerivativeGroupQuadsNV : I32EnumAttrCase<"ComputeDerivativeGroupQuadsNV", 5288>; +def SPV_C_FragmentDensityEXT : I32EnumAttrCase<"FragmentDensityEXT", 5291>; +def SPV_C_GroupNonUniformPartitionedNV : I32EnumAttrCase<"GroupNonUniformPartitionedNV", 5297>; +def SPV_C_ShaderNonUniform : I32EnumAttrCase<"ShaderNonUniform", 5301>; +def SPV_C_RuntimeDescriptorArray : I32EnumAttrCase<"RuntimeDescriptorArray", 5302>; +def SPV_C_InputAttachmentArrayDynamicIndexing : I32EnumAttrCase<"InputAttachmentArrayDynamicIndexing", 5303>; +def SPV_C_UniformTexelBufferArrayDynamicIndexing : I32EnumAttrCase<"UniformTexelBufferArrayDynamicIndexing", 5304>; +def SPV_C_StorageTexelBufferArrayDynamicIndexing : I32EnumAttrCase<"StorageTexelBufferArrayDynamicIndexing", 5305>; +def SPV_C_UniformBufferArrayNonUniformIndexing : I32EnumAttrCase<"UniformBufferArrayNonUniformIndexing", 5306>; +def SPV_C_SampledImageArrayNonUniformIndexing : I32EnumAttrCase<"SampledImageArrayNonUniformIndexing", 5307>; +def SPV_C_StorageBufferArrayNonUniformIndexing : I32EnumAttrCase<"StorageBufferArrayNonUniformIndexing", 5308>; +def SPV_C_StorageImageArrayNonUniformIndexing : I32EnumAttrCase<"StorageImageArrayNonUniformIndexing", 5309>; +def SPV_C_InputAttachmentArrayNonUniformIndexing : I32EnumAttrCase<"InputAttachmentArrayNonUniformIndexing", 5310>; +def SPV_C_UniformTexelBufferArrayNonUniformIndexing : I32EnumAttrCase<"UniformTexelBufferArrayNonUniformIndexing", 5311>; +def SPV_C_StorageTexelBufferArrayNonUniformIndexing : I32EnumAttrCase<"StorageTexelBufferArrayNonUniformIndexing", 5312>; +def SPV_C_RayTracingNV : I32EnumAttrCase<"RayTracingNV", 5340>; +def SPV_C_VulkanMemoryModel : I32EnumAttrCase<"VulkanMemoryModel", 5345>; +def SPV_C_VulkanMemoryModelDeviceScope : I32EnumAttrCase<"VulkanMemoryModelDeviceScope", 5346>; +def SPV_C_PhysicalStorageBufferAddresses : I32EnumAttrCase<"PhysicalStorageBufferAddresses", 5347>; +def SPV_C_ComputeDerivativeGroupLinearNV : 
I32EnumAttrCase<"ComputeDerivativeGroupLinearNV", 5350>; +def SPV_C_CooperativeMatrixNV : I32EnumAttrCase<"CooperativeMatrixNV", 5357>; +def SPV_C_FragmentShaderSampleInterlockEXT : I32EnumAttrCase<"FragmentShaderSampleInterlockEXT", 5363>; +def SPV_C_FragmentShaderShadingRateInterlockEXT : I32EnumAttrCase<"FragmentShaderShadingRateInterlockEXT", 5372>; +def SPV_C_ShaderSMBuiltinsNV : I32EnumAttrCase<"ShaderSMBuiltinsNV", 5373>; +def SPV_C_FragmentShaderPixelInterlockEXT : I32EnumAttrCase<"FragmentShaderPixelInterlockEXT", 5378>; +def SPV_C_DemoteToHelperInvocationEXT : I32EnumAttrCase<"DemoteToHelperInvocationEXT", 5379>; +def SPV_C_SubgroupShuffleINTEL : I32EnumAttrCase<"SubgroupShuffleINTEL", 5568>; +def SPV_C_SubgroupBufferBlockIOINTEL : I32EnumAttrCase<"SubgroupBufferBlockIOINTEL", 5569>; +def SPV_C_SubgroupImageBlockIOINTEL : I32EnumAttrCase<"SubgroupImageBlockIOINTEL", 5570>; +def SPV_C_SubgroupImageMediaBlockIOINTEL : I32EnumAttrCase<"SubgroupImageMediaBlockIOINTEL", 5579>; +def SPV_C_IntegerFunctions2INTEL : I32EnumAttrCase<"IntegerFunctions2INTEL", 5584>; +def SPV_C_SubgroupAvcMotionEstimationINTEL : I32EnumAttrCase<"SubgroupAvcMotionEstimationINTEL", 5696>; +def SPV_C_SubgroupAvcMotionEstimationIntraINTEL : I32EnumAttrCase<"SubgroupAvcMotionEstimationIntraINTEL", 5697>; +def SPV_C_SubgroupAvcMotionEstimationChromaINTEL : I32EnumAttrCase<"SubgroupAvcMotionEstimationChromaINTEL", 5698>; + +def SPV_CapabilityAttr : + I32EnumAttr<"Capability", "valid SPIR-V Capability", [ + SPV_C_Matrix, SPV_C_Shader, SPV_C_Geometry, SPV_C_Tessellation, + SPV_C_Addresses, SPV_C_Linkage, SPV_C_Kernel, SPV_C_Vector16, + SPV_C_Float16Buffer, SPV_C_Float16, SPV_C_Float64, SPV_C_Int64, + SPV_C_Int64Atomics, SPV_C_ImageBasic, SPV_C_ImageReadWrite, SPV_C_ImageMipmap, + SPV_C_Pipes, SPV_C_Groups, SPV_C_DeviceEnqueue, SPV_C_LiteralSampler, + SPV_C_AtomicStorage, SPV_C_Int16, SPV_C_TessellationPointSize, + SPV_C_GeometryPointSize, SPV_C_ImageGatherExtended, + 
SPV_C_StorageImageMultisample, SPV_C_UniformBufferArrayDynamicIndexing, + SPV_C_SampledImageArrayDynamicIndexing, + SPV_C_StorageBufferArrayDynamicIndexing, + SPV_C_StorageImageArrayDynamicIndexing, SPV_C_ClipDistance, SPV_C_CullDistance, + SPV_C_ImageCubeArray, SPV_C_SampleRateShading, SPV_C_ImageRect, + SPV_C_SampledRect, SPV_C_GenericPointer, SPV_C_Int8, SPV_C_InputAttachment, + SPV_C_SparseResidency, SPV_C_MinLod, SPV_C_Sampled1D, SPV_C_Image1D, + SPV_C_SampledCubeArray, SPV_C_SampledBuffer, SPV_C_ImageBuffer, + SPV_C_ImageMSArray, SPV_C_StorageImageExtendedFormats, SPV_C_ImageQuery, + SPV_C_DerivativeControl, SPV_C_InterpolationFunction, SPV_C_TransformFeedback, + SPV_C_GeometryStreams, SPV_C_StorageImageReadWithoutFormat, + SPV_C_StorageImageWriteWithoutFormat, SPV_C_MultiViewport, + SPV_C_SubgroupDispatch, SPV_C_NamedBarrier, SPV_C_PipeStorage, + SPV_C_GroupNonUniform, SPV_C_GroupNonUniformVote, + SPV_C_GroupNonUniformArithmetic, SPV_C_GroupNonUniformBallot, + SPV_C_GroupNonUniformShuffle, SPV_C_GroupNonUniformShuffleRelative, + SPV_C_GroupNonUniformClustered, SPV_C_GroupNonUniformQuad, SPV_C_ShaderLayer, + SPV_C_ShaderViewportIndex, SPV_C_SubgroupBallotKHR, SPV_C_DrawParameters, + SPV_C_SubgroupVoteKHR, SPV_C_StorageBuffer16BitAccess, SPV_C_StorageUniform16, + SPV_C_StoragePushConstant16, SPV_C_StorageInputOutput16, SPV_C_DeviceGroup, + SPV_C_MultiView, SPV_C_VariablePointersStorageBuffer, SPV_C_VariablePointers, + SPV_C_AtomicStorageOps, SPV_C_SampleMaskPostDepthCoverage, + SPV_C_StorageBuffer8BitAccess, SPV_C_UniformAndStorageBuffer8BitAccess, + SPV_C_StoragePushConstant8, SPV_C_DenormPreserve, SPV_C_DenormFlushToZero, + SPV_C_SignedZeroInfNanPreserve, SPV_C_RoundingModeRTE, SPV_C_RoundingModeRTZ, + SPV_C_Float16ImageAMD, SPV_C_ImageGatherBiasLodAMD, SPV_C_FragmentMaskAMD, + SPV_C_StencilExportEXT, SPV_C_ImageReadWriteLodAMD, SPV_C_ShaderClockKHR, + SPV_C_SampleMaskOverrideCoverageNV, SPV_C_GeometryShaderPassthroughNV, + SPV_C_ShaderViewportIndexLayerEXT, 
SPV_C_ShaderViewportMaskNV, + SPV_C_ShaderStereoViewNV, SPV_C_PerViewAttributesNV, + SPV_C_FragmentFullyCoveredEXT, SPV_C_MeshShadingNV, SPV_C_ImageFootprintNV, + SPV_C_FragmentBarycentricNV, SPV_C_ComputeDerivativeGroupQuadsNV, + SPV_C_FragmentDensityEXT, SPV_C_GroupNonUniformPartitionedNV, + SPV_C_ShaderNonUniform, SPV_C_RuntimeDescriptorArray, + SPV_C_InputAttachmentArrayDynamicIndexing, + SPV_C_UniformTexelBufferArrayDynamicIndexing, + SPV_C_StorageTexelBufferArrayDynamicIndexing, + SPV_C_UniformBufferArrayNonUniformIndexing, + SPV_C_SampledImageArrayNonUniformIndexing, + SPV_C_StorageBufferArrayNonUniformIndexing, + SPV_C_StorageImageArrayNonUniformIndexing, + SPV_C_InputAttachmentArrayNonUniformIndexing, + SPV_C_UniformTexelBufferArrayNonUniformIndexing, + SPV_C_StorageTexelBufferArrayNonUniformIndexing, SPV_C_RayTracingNV, + SPV_C_VulkanMemoryModel, SPV_C_VulkanMemoryModelDeviceScope, + SPV_C_PhysicalStorageBufferAddresses, SPV_C_ComputeDerivativeGroupLinearNV, + SPV_C_CooperativeMatrixNV, SPV_C_FragmentShaderSampleInterlockEXT, + SPV_C_FragmentShaderShadingRateInterlockEXT, SPV_C_ShaderSMBuiltinsNV, + SPV_C_FragmentShaderPixelInterlockEXT, SPV_C_DemoteToHelperInvocationEXT, + SPV_C_SubgroupShuffleINTEL, SPV_C_SubgroupBufferBlockIOINTEL, + SPV_C_SubgroupImageBlockIOINTEL, SPV_C_SubgroupImageMediaBlockIOINTEL, + SPV_C_IntegerFunctions2INTEL, SPV_C_SubgroupAvcMotionEstimationINTEL, + SPV_C_SubgroupAvcMotionEstimationIntraINTEL, + SPV_C_SubgroupAvcMotionEstimationChromaINTEL + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_AM_Logical : I32EnumAttrCase<"Logical", 0>; +def SPV_AM_Physical32 : I32EnumAttrCase<"Physical32", 1>; +def SPV_AM_Physical64 : I32EnumAttrCase<"Physical64", 2>; +def SPV_AM_PhysicalStorageBuffer64 : I32EnumAttrCase<"PhysicalStorageBuffer64", 5348>; + +def SPV_AddressingModelAttr : + I32EnumAttr<"AddressingModel", "valid SPIR-V AddressingModel", [ + SPV_AM_Logical, SPV_AM_Physical32, SPV_AM_Physical64, + 
SPV_AM_PhysicalStorageBuffer64 + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_BI_Position : I32EnumAttrCase<"Position", 0>; +def SPV_BI_PointSize : I32EnumAttrCase<"PointSize", 1>; +def SPV_BI_ClipDistance : I32EnumAttrCase<"ClipDistance", 3>; +def SPV_BI_CullDistance : I32EnumAttrCase<"CullDistance", 4>; +def SPV_BI_VertexId : I32EnumAttrCase<"VertexId", 5>; +def SPV_BI_InstanceId : I32EnumAttrCase<"InstanceId", 6>; +def SPV_BI_PrimitiveId : I32EnumAttrCase<"PrimitiveId", 7>; +def SPV_BI_InvocationId : I32EnumAttrCase<"InvocationId", 8>; +def SPV_BI_Layer : I32EnumAttrCase<"Layer", 9>; +def SPV_BI_ViewportIndex : I32EnumAttrCase<"ViewportIndex", 10>; +def SPV_BI_TessLevelOuter : I32EnumAttrCase<"TessLevelOuter", 11>; +def SPV_BI_TessLevelInner : I32EnumAttrCase<"TessLevelInner", 12>; +def SPV_BI_TessCoord : I32EnumAttrCase<"TessCoord", 13>; +def SPV_BI_PatchVertices : I32EnumAttrCase<"PatchVertices", 14>; +def SPV_BI_FragCoord : I32EnumAttrCase<"FragCoord", 15>; +def SPV_BI_PointCoord : I32EnumAttrCase<"PointCoord", 16>; +def SPV_BI_FrontFacing : I32EnumAttrCase<"FrontFacing", 17>; +def SPV_BI_SampleId : I32EnumAttrCase<"SampleId", 18>; +def SPV_BI_SamplePosition : I32EnumAttrCase<"SamplePosition", 19>; +def SPV_BI_SampleMask : I32EnumAttrCase<"SampleMask", 20>; +def SPV_BI_FragDepth : I32EnumAttrCase<"FragDepth", 22>; +def SPV_BI_HelperInvocation : I32EnumAttrCase<"HelperInvocation", 23>; +def SPV_BI_NumWorkgroups : I32EnumAttrCase<"NumWorkgroups", 24>; +def SPV_BI_WorkgroupSize : I32EnumAttrCase<"WorkgroupSize", 25>; +def SPV_BI_WorkgroupId : I32EnumAttrCase<"WorkgroupId", 26>; +def SPV_BI_LocalInvocationId : I32EnumAttrCase<"LocalInvocationId", 27>; +def SPV_BI_GlobalInvocationId : I32EnumAttrCase<"GlobalInvocationId", 28>; +def SPV_BI_LocalInvocationIndex : I32EnumAttrCase<"LocalInvocationIndex", 29>; +def SPV_BI_WorkDim : I32EnumAttrCase<"WorkDim", 30>; +def SPV_BI_GlobalSize : I32EnumAttrCase<"GlobalSize", 31>; +def SPV_BI_EnqueuedWorkgroupSize : 
I32EnumAttrCase<"EnqueuedWorkgroupSize", 32>; +def SPV_BI_GlobalOffset : I32EnumAttrCase<"GlobalOffset", 33>; +def SPV_BI_GlobalLinearId : I32EnumAttrCase<"GlobalLinearId", 34>; +def SPV_BI_SubgroupSize : I32EnumAttrCase<"SubgroupSize", 36>; +def SPV_BI_SubgroupMaxSize : I32EnumAttrCase<"SubgroupMaxSize", 37>; +def SPV_BI_NumSubgroups : I32EnumAttrCase<"NumSubgroups", 38>; +def SPV_BI_NumEnqueuedSubgroups : I32EnumAttrCase<"NumEnqueuedSubgroups", 39>; +def SPV_BI_SubgroupId : I32EnumAttrCase<"SubgroupId", 40>; +def SPV_BI_SubgroupLocalInvocationId : I32EnumAttrCase<"SubgroupLocalInvocationId", 41>; +def SPV_BI_VertexIndex : I32EnumAttrCase<"VertexIndex", 42>; +def SPV_BI_InstanceIndex : I32EnumAttrCase<"InstanceIndex", 43>; +def SPV_BI_SubgroupEqMask : I32EnumAttrCase<"SubgroupEqMask", 4416>; +def SPV_BI_SubgroupGeMask : I32EnumAttrCase<"SubgroupGeMask", 4417>; +def SPV_BI_SubgroupGtMask : I32EnumAttrCase<"SubgroupGtMask", 4418>; +def SPV_BI_SubgroupLeMask : I32EnumAttrCase<"SubgroupLeMask", 4419>; +def SPV_BI_SubgroupLtMask : I32EnumAttrCase<"SubgroupLtMask", 4420>; +def SPV_BI_BaseVertex : I32EnumAttrCase<"BaseVertex", 4424>; +def SPV_BI_BaseInstance : I32EnumAttrCase<"BaseInstance", 4425>; +def SPV_BI_DrawIndex : I32EnumAttrCase<"DrawIndex", 4426>; +def SPV_BI_DeviceIndex : I32EnumAttrCase<"DeviceIndex", 4438>; +def SPV_BI_ViewIndex : I32EnumAttrCase<"ViewIndex", 4440>; +def SPV_BI_BaryCoordNoPerspAMD : I32EnumAttrCase<"BaryCoordNoPerspAMD", 4992>; +def SPV_BI_BaryCoordNoPerspCentroidAMD : I32EnumAttrCase<"BaryCoordNoPerspCentroidAMD", 4993>; +def SPV_BI_BaryCoordNoPerspSampleAMD : I32EnumAttrCase<"BaryCoordNoPerspSampleAMD", 4994>; +def SPV_BI_BaryCoordSmoothAMD : I32EnumAttrCase<"BaryCoordSmoothAMD", 4995>; +def SPV_BI_BaryCoordSmoothCentroidAMD : I32EnumAttrCase<"BaryCoordSmoothCentroidAMD", 4996>; +def SPV_BI_BaryCoordSmoothSampleAMD : I32EnumAttrCase<"BaryCoordSmoothSampleAMD", 4997>; +def SPV_BI_BaryCoordPullModelAMD : 
I32EnumAttrCase<"BaryCoordPullModelAMD", 4998>; +def SPV_BI_FragStencilRefEXT : I32EnumAttrCase<"FragStencilRefEXT", 5014>; +def SPV_BI_ViewportMaskNV : I32EnumAttrCase<"ViewportMaskNV", 5253>; +def SPV_BI_SecondaryPositionNV : I32EnumAttrCase<"SecondaryPositionNV", 5257>; +def SPV_BI_SecondaryViewportMaskNV : I32EnumAttrCase<"SecondaryViewportMaskNV", 5258>; +def SPV_BI_PositionPerViewNV : I32EnumAttrCase<"PositionPerViewNV", 5261>; +def SPV_BI_ViewportMaskPerViewNV : I32EnumAttrCase<"ViewportMaskPerViewNV", 5262>; +def SPV_BI_FullyCoveredEXT : I32EnumAttrCase<"FullyCoveredEXT", 5264>; +def SPV_BI_TaskCountNV : I32EnumAttrCase<"TaskCountNV", 5274>; +def SPV_BI_PrimitiveCountNV : I32EnumAttrCase<"PrimitiveCountNV", 5275>; +def SPV_BI_PrimitiveIndicesNV : I32EnumAttrCase<"PrimitiveIndicesNV", 5276>; +def SPV_BI_ClipDistancePerViewNV : I32EnumAttrCase<"ClipDistancePerViewNV", 5277>; +def SPV_BI_CullDistancePerViewNV : I32EnumAttrCase<"CullDistancePerViewNV", 5278>; +def SPV_BI_LayerPerViewNV : I32EnumAttrCase<"LayerPerViewNV", 5279>; +def SPV_BI_MeshViewCountNV : I32EnumAttrCase<"MeshViewCountNV", 5280>; +def SPV_BI_MeshViewIndicesNV : I32EnumAttrCase<"MeshViewIndicesNV", 5281>; +def SPV_BI_BaryCoordNV : I32EnumAttrCase<"BaryCoordNV", 5286>; +def SPV_BI_BaryCoordNoPerspNV : I32EnumAttrCase<"BaryCoordNoPerspNV", 5287>; +def SPV_BI_FragSizeEXT : I32EnumAttrCase<"FragSizeEXT", 5292>; +def SPV_BI_FragInvocationCountEXT : I32EnumAttrCase<"FragInvocationCountEXT", 5293>; +def SPV_BI_LaunchIdNV : I32EnumAttrCase<"LaunchIdNV", 5319>; +def SPV_BI_LaunchSizeNV : I32EnumAttrCase<"LaunchSizeNV", 5320>; +def SPV_BI_WorldRayOriginNV : I32EnumAttrCase<"WorldRayOriginNV", 5321>; +def SPV_BI_WorldRayDirectionNV : I32EnumAttrCase<"WorldRayDirectionNV", 5322>; +def SPV_BI_ObjectRayOriginNV : I32EnumAttrCase<"ObjectRayOriginNV", 5323>; +def SPV_BI_ObjectRayDirectionNV : I32EnumAttrCase<"ObjectRayDirectionNV", 5324>; +def SPV_BI_RayTminNV : I32EnumAttrCase<"RayTminNV", 5325>; +def 
SPV_BI_RayTmaxNV : I32EnumAttrCase<"RayTmaxNV", 5326>; +def SPV_BI_InstanceCustomIndexNV : I32EnumAttrCase<"InstanceCustomIndexNV", 5327>; +def SPV_BI_ObjectToWorldNV : I32EnumAttrCase<"ObjectToWorldNV", 5330>; +def SPV_BI_WorldToObjectNV : I32EnumAttrCase<"WorldToObjectNV", 5331>; +def SPV_BI_HitTNV : I32EnumAttrCase<"HitTNV", 5332>; +def SPV_BI_HitKindNV : I32EnumAttrCase<"HitKindNV", 5333>; +def SPV_BI_IncomingRayFlagsNV : I32EnumAttrCase<"IncomingRayFlagsNV", 5351>; +def SPV_BI_WarpsPerSMNV : I32EnumAttrCase<"WarpsPerSMNV", 5374>; +def SPV_BI_SMCountNV : I32EnumAttrCase<"SMCountNV", 5375>; +def SPV_BI_WarpIDNV : I32EnumAttrCase<"WarpIDNV", 5376>; +def SPV_BI_SMIDNV : I32EnumAttrCase<"SMIDNV", 5377>; + +def SPV_BuiltInAttr : + I32EnumAttr<"BuiltIn", "valid SPIR-V BuiltIn", [ + SPV_BI_Position, SPV_BI_PointSize, SPV_BI_ClipDistance, SPV_BI_CullDistance, + SPV_BI_VertexId, SPV_BI_InstanceId, SPV_BI_PrimitiveId, SPV_BI_InvocationId, + SPV_BI_Layer, SPV_BI_ViewportIndex, SPV_BI_TessLevelOuter, + SPV_BI_TessLevelInner, SPV_BI_TessCoord, SPV_BI_PatchVertices, + SPV_BI_FragCoord, SPV_BI_PointCoord, SPV_BI_FrontFacing, SPV_BI_SampleId, + SPV_BI_SamplePosition, SPV_BI_SampleMask, SPV_BI_FragDepth, + SPV_BI_HelperInvocation, SPV_BI_NumWorkgroups, SPV_BI_WorkgroupSize, + SPV_BI_WorkgroupId, SPV_BI_LocalInvocationId, SPV_BI_GlobalInvocationId, + SPV_BI_LocalInvocationIndex, SPV_BI_WorkDim, SPV_BI_GlobalSize, + SPV_BI_EnqueuedWorkgroupSize, SPV_BI_GlobalOffset, SPV_BI_GlobalLinearId, + SPV_BI_SubgroupSize, SPV_BI_SubgroupMaxSize, SPV_BI_NumSubgroups, + SPV_BI_NumEnqueuedSubgroups, SPV_BI_SubgroupId, + SPV_BI_SubgroupLocalInvocationId, SPV_BI_VertexIndex, SPV_BI_InstanceIndex, + SPV_BI_SubgroupEqMask, SPV_BI_SubgroupGeMask, SPV_BI_SubgroupGtMask, + SPV_BI_SubgroupLeMask, SPV_BI_SubgroupLtMask, SPV_BI_BaseVertex, + SPV_BI_BaseInstance, SPV_BI_DrawIndex, SPV_BI_DeviceIndex, SPV_BI_ViewIndex, + SPV_BI_BaryCoordNoPerspAMD, SPV_BI_BaryCoordNoPerspCentroidAMD, + 
SPV_BI_BaryCoordNoPerspSampleAMD, SPV_BI_BaryCoordSmoothAMD, + SPV_BI_BaryCoordSmoothCentroidAMD, SPV_BI_BaryCoordSmoothSampleAMD, + SPV_BI_BaryCoordPullModelAMD, SPV_BI_FragStencilRefEXT, SPV_BI_ViewportMaskNV, + SPV_BI_SecondaryPositionNV, SPV_BI_SecondaryViewportMaskNV, + SPV_BI_PositionPerViewNV, SPV_BI_ViewportMaskPerViewNV, SPV_BI_FullyCoveredEXT, + SPV_BI_TaskCountNV, SPV_BI_PrimitiveCountNV, SPV_BI_PrimitiveIndicesNV, + SPV_BI_ClipDistancePerViewNV, SPV_BI_CullDistancePerViewNV, + SPV_BI_LayerPerViewNV, SPV_BI_MeshViewCountNV, SPV_BI_MeshViewIndicesNV, + SPV_BI_BaryCoordNV, SPV_BI_BaryCoordNoPerspNV, SPV_BI_FragSizeEXT, + SPV_BI_FragInvocationCountEXT, SPV_BI_LaunchIdNV, SPV_BI_LaunchSizeNV, + SPV_BI_WorldRayOriginNV, SPV_BI_WorldRayDirectionNV, SPV_BI_ObjectRayOriginNV, + SPV_BI_ObjectRayDirectionNV, SPV_BI_RayTminNV, SPV_BI_RayTmaxNV, + SPV_BI_InstanceCustomIndexNV, SPV_BI_ObjectToWorldNV, SPV_BI_WorldToObjectNV, + SPV_BI_HitTNV, SPV_BI_HitKindNV, SPV_BI_IncomingRayFlagsNV, + SPV_BI_WarpsPerSMNV, SPV_BI_SMCountNV, SPV_BI_WarpIDNV, SPV_BI_SMIDNV + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_D_RelaxedPrecision : I32EnumAttrCase<"RelaxedPrecision", 0>; +def SPV_D_SpecId : I32EnumAttrCase<"SpecId", 1>; +def SPV_D_Block : I32EnumAttrCase<"Block", 2>; +def SPV_D_BufferBlock : I32EnumAttrCase<"BufferBlock", 3>; +def SPV_D_RowMajor : I32EnumAttrCase<"RowMajor", 4>; +def SPV_D_ColMajor : I32EnumAttrCase<"ColMajor", 5>; +def SPV_D_ArrayStride : I32EnumAttrCase<"ArrayStride", 6>; +def SPV_D_MatrixStride : I32EnumAttrCase<"MatrixStride", 7>; +def SPV_D_GLSLShared : I32EnumAttrCase<"GLSLShared", 8>; +def SPV_D_GLSLPacked : I32EnumAttrCase<"GLSLPacked", 9>; +def SPV_D_CPacked : I32EnumAttrCase<"CPacked", 10>; +def SPV_D_BuiltIn : I32EnumAttrCase<"BuiltIn", 11>; +def SPV_D_NoPerspective : I32EnumAttrCase<"NoPerspective", 13>; +def SPV_D_Flat : I32EnumAttrCase<"Flat", 14>; +def SPV_D_Patch : I32EnumAttrCase<"Patch", 15>; +def SPV_D_Centroid : 
I32EnumAttrCase<"Centroid", 16>; +def SPV_D_Sample : I32EnumAttrCase<"Sample", 17>; +def SPV_D_Invariant : I32EnumAttrCase<"Invariant", 18>; +def SPV_D_Restrict : I32EnumAttrCase<"Restrict", 19>; +def SPV_D_Aliased : I32EnumAttrCase<"Aliased", 20>; +def SPV_D_Volatile : I32EnumAttrCase<"Volatile", 21>; +def SPV_D_Constant : I32EnumAttrCase<"Constant", 22>; +def SPV_D_Coherent : I32EnumAttrCase<"Coherent", 23>; +def SPV_D_NonWritable : I32EnumAttrCase<"NonWritable", 24>; +def SPV_D_NonReadable : I32EnumAttrCase<"NonReadable", 25>; +def SPV_D_Uniform : I32EnumAttrCase<"Uniform", 26>; +def SPV_D_UniformId : I32EnumAttrCase<"UniformId", 27>; +def SPV_D_SaturatedConversion : I32EnumAttrCase<"SaturatedConversion", 28>; +def SPV_D_Stream : I32EnumAttrCase<"Stream", 29>; +def SPV_D_Location : I32EnumAttrCase<"Location", 30>; +def SPV_D_Component : I32EnumAttrCase<"Component", 31>; +def SPV_D_Index : I32EnumAttrCase<"Index", 32>; +def SPV_D_Binding : I32EnumAttrCase<"Binding", 33>; +def SPV_D_DescriptorSet : I32EnumAttrCase<"DescriptorSet", 34>; +def SPV_D_Offset : I32EnumAttrCase<"Offset", 35>; +def SPV_D_XfbBuffer : I32EnumAttrCase<"XfbBuffer", 36>; +def SPV_D_XfbStride : I32EnumAttrCase<"XfbStride", 37>; +def SPV_D_FuncParamAttr : I32EnumAttrCase<"FuncParamAttr", 38>; +def SPV_D_FPRoundingMode : I32EnumAttrCase<"FPRoundingMode", 39>; +def SPV_D_FPFastMathMode : I32EnumAttrCase<"FPFastMathMode", 40>; +def SPV_D_LinkageAttributes : I32EnumAttrCase<"LinkageAttributes", 41>; +def SPV_D_NoContraction : I32EnumAttrCase<"NoContraction", 42>; +def SPV_D_InputAttachmentIndex : I32EnumAttrCase<"InputAttachmentIndex", 43>; +def SPV_D_Alignment : I32EnumAttrCase<"Alignment", 44>; +def SPV_D_MaxByteOffset : I32EnumAttrCase<"MaxByteOffset", 45>; +def SPV_D_AlignmentId : I32EnumAttrCase<"AlignmentId", 46>; +def SPV_D_MaxByteOffsetId : I32EnumAttrCase<"MaxByteOffsetId", 47>; +def SPV_D_NoSignedWrap : I32EnumAttrCase<"NoSignedWrap", 4469>; +def SPV_D_NoUnsignedWrap : 
I32EnumAttrCase<"NoUnsignedWrap", 4470>; +def SPV_D_ExplicitInterpAMD : I32EnumAttrCase<"ExplicitInterpAMD", 4999>; +def SPV_D_OverrideCoverageNV : I32EnumAttrCase<"OverrideCoverageNV", 5248>; +def SPV_D_PassthroughNV : I32EnumAttrCase<"PassthroughNV", 5250>; +def SPV_D_ViewportRelativeNV : I32EnumAttrCase<"ViewportRelativeNV", 5252>; +def SPV_D_SecondaryViewportRelativeNV : I32EnumAttrCase<"SecondaryViewportRelativeNV", 5256>; +def SPV_D_PerPrimitiveNV : I32EnumAttrCase<"PerPrimitiveNV", 5271>; +def SPV_D_PerViewNV : I32EnumAttrCase<"PerViewNV", 5272>; +def SPV_D_PerTaskNV : I32EnumAttrCase<"PerTaskNV", 5273>; +def SPV_D_PerVertexNV : I32EnumAttrCase<"PerVertexNV", 5285>; +def SPV_D_NonUniform : I32EnumAttrCase<"NonUniform", 5300>; +def SPV_D_RestrictPointer : I32EnumAttrCase<"RestrictPointer", 5355>; +def SPV_D_AliasedPointer : I32EnumAttrCase<"AliasedPointer", 5356>; +def SPV_D_CounterBuffer : I32EnumAttrCase<"CounterBuffer", 5634>; +def SPV_D_UserSemantic : I32EnumAttrCase<"UserSemantic", 5635>; +def SPV_D_UserTypeGOOGLE : I32EnumAttrCase<"UserTypeGOOGLE", 5636>; + +def SPV_DecorationAttr : + I32EnumAttr<"Decoration", "valid SPIR-V Decoration", [ + SPV_D_RelaxedPrecision, SPV_D_SpecId, SPV_D_Block, SPV_D_BufferBlock, + SPV_D_RowMajor, SPV_D_ColMajor, SPV_D_ArrayStride, SPV_D_MatrixStride, + SPV_D_GLSLShared, SPV_D_GLSLPacked, SPV_D_CPacked, SPV_D_BuiltIn, + SPV_D_NoPerspective, SPV_D_Flat, SPV_D_Patch, SPV_D_Centroid, SPV_D_Sample, + SPV_D_Invariant, SPV_D_Restrict, SPV_D_Aliased, SPV_D_Volatile, SPV_D_Constant, + SPV_D_Coherent, SPV_D_NonWritable, SPV_D_NonReadable, SPV_D_Uniform, + SPV_D_UniformId, SPV_D_SaturatedConversion, SPV_D_Stream, SPV_D_Location, + SPV_D_Component, SPV_D_Index, SPV_D_Binding, SPV_D_DescriptorSet, SPV_D_Offset, + SPV_D_XfbBuffer, SPV_D_XfbStride, SPV_D_FuncParamAttr, SPV_D_FPRoundingMode, + SPV_D_FPFastMathMode, SPV_D_LinkageAttributes, SPV_D_NoContraction, + SPV_D_InputAttachmentIndex, SPV_D_Alignment, SPV_D_MaxByteOffset, + 
SPV_D_AlignmentId, SPV_D_MaxByteOffsetId, SPV_D_NoSignedWrap, + SPV_D_NoUnsignedWrap, SPV_D_ExplicitInterpAMD, SPV_D_OverrideCoverageNV, + SPV_D_PassthroughNV, SPV_D_ViewportRelativeNV, + SPV_D_SecondaryViewportRelativeNV, SPV_D_PerPrimitiveNV, SPV_D_PerViewNV, + SPV_D_PerTaskNV, SPV_D_PerVertexNV, SPV_D_NonUniform, SPV_D_RestrictPointer, + SPV_D_AliasedPointer, SPV_D_CounterBuffer, SPV_D_UserSemantic, + SPV_D_UserTypeGOOGLE + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_D_1D : I32EnumAttrCase<"Dim1D", 0>; +def SPV_D_2D : I32EnumAttrCase<"Dim2D", 1>; +def SPV_D_3D : I32EnumAttrCase<"Dim3D", 2>; +def SPV_D_Cube : I32EnumAttrCase<"Cube", 3>; +def SPV_D_Rect : I32EnumAttrCase<"Rect", 4>; +def SPV_D_Buffer : I32EnumAttrCase<"Buffer", 5>; +def SPV_D_SubpassData : I32EnumAttrCase<"SubpassData", 6>; + +def SPV_DimAttr : + I32EnumAttr<"Dim", "valid SPIR-V Dim", [ + SPV_D_1D, SPV_D_2D, SPV_D_3D, SPV_D_Cube, SPV_D_Rect, SPV_D_Buffer, + SPV_D_SubpassData + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_EM_Invocations : I32EnumAttrCase<"Invocations", 0>; +def SPV_EM_SpacingEqual : I32EnumAttrCase<"SpacingEqual", 1>; +def SPV_EM_SpacingFractionalEven : I32EnumAttrCase<"SpacingFractionalEven", 2>; +def SPV_EM_SpacingFractionalOdd : I32EnumAttrCase<"SpacingFractionalOdd", 3>; +def SPV_EM_VertexOrderCw : I32EnumAttrCase<"VertexOrderCw", 4>; +def SPV_EM_VertexOrderCcw : I32EnumAttrCase<"VertexOrderCcw", 5>; +def SPV_EM_PixelCenterInteger : I32EnumAttrCase<"PixelCenterInteger", 6>; +def SPV_EM_OriginUpperLeft : I32EnumAttrCase<"OriginUpperLeft", 7>; +def SPV_EM_OriginLowerLeft : I32EnumAttrCase<"OriginLowerLeft", 8>; +def SPV_EM_EarlyFragmentTests : I32EnumAttrCase<"EarlyFragmentTests", 9>; +def SPV_EM_PointMode : I32EnumAttrCase<"PointMode", 10>; +def SPV_EM_Xfb : I32EnumAttrCase<"Xfb", 11>; +def SPV_EM_DepthReplacing : I32EnumAttrCase<"DepthReplacing", 12>; +def SPV_EM_DepthGreater : I32EnumAttrCase<"DepthGreater", 14>; +def SPV_EM_DepthLess : 
I32EnumAttrCase<"DepthLess", 15>; +def SPV_EM_DepthUnchanged : I32EnumAttrCase<"DepthUnchanged", 16>; +def SPV_EM_LocalSize : I32EnumAttrCase<"LocalSize", 17>; +def SPV_EM_LocalSizeHint : I32EnumAttrCase<"LocalSizeHint", 18>; +def SPV_EM_InputPoints : I32EnumAttrCase<"InputPoints", 19>; +def SPV_EM_InputLines : I32EnumAttrCase<"InputLines", 20>; +def SPV_EM_InputLinesAdjacency : I32EnumAttrCase<"InputLinesAdjacency", 21>; +def SPV_EM_Triangles : I32EnumAttrCase<"Triangles", 22>; +def SPV_EM_InputTrianglesAdjacency : I32EnumAttrCase<"InputTrianglesAdjacency", 23>; +def SPV_EM_Quads : I32EnumAttrCase<"Quads", 24>; +def SPV_EM_Isolines : I32EnumAttrCase<"Isolines", 25>; +def SPV_EM_OutputVertices : I32EnumAttrCase<"OutputVertices", 26>; +def SPV_EM_OutputPoints : I32EnumAttrCase<"OutputPoints", 27>; +def SPV_EM_OutputLineStrip : I32EnumAttrCase<"OutputLineStrip", 28>; +def SPV_EM_OutputTriangleStrip : I32EnumAttrCase<"OutputTriangleStrip", 29>; +def SPV_EM_VecTypeHint : I32EnumAttrCase<"VecTypeHint", 30>; +def SPV_EM_ContractionOff : I32EnumAttrCase<"ContractionOff", 31>; +def SPV_EM_Initializer : I32EnumAttrCase<"Initializer", 33>; +def SPV_EM_Finalizer : I32EnumAttrCase<"Finalizer", 34>; +def SPV_EM_SubgroupSize : I32EnumAttrCase<"SubgroupSize", 35>; +def SPV_EM_SubgroupsPerWorkgroup : I32EnumAttrCase<"SubgroupsPerWorkgroup", 36>; +def SPV_EM_SubgroupsPerWorkgroupId : I32EnumAttrCase<"SubgroupsPerWorkgroupId", 37>; +def SPV_EM_LocalSizeId : I32EnumAttrCase<"LocalSizeId", 38>; +def SPV_EM_LocalSizeHintId : I32EnumAttrCase<"LocalSizeHintId", 39>; +def SPV_EM_PostDepthCoverage : I32EnumAttrCase<"PostDepthCoverage", 4446>; +def SPV_EM_DenormPreserve : I32EnumAttrCase<"DenormPreserve", 4459>; +def SPV_EM_DenormFlushToZero : I32EnumAttrCase<"DenormFlushToZero", 4460>; +def SPV_EM_SignedZeroInfNanPreserve : I32EnumAttrCase<"SignedZeroInfNanPreserve", 4461>; +def SPV_EM_RoundingModeRTE : I32EnumAttrCase<"RoundingModeRTE", 4462>; +def SPV_EM_RoundingModeRTZ : 
I32EnumAttrCase<"RoundingModeRTZ", 4463>; +def SPV_EM_StencilRefReplacingEXT : I32EnumAttrCase<"StencilRefReplacingEXT", 5027>; +def SPV_EM_OutputLinesNV : I32EnumAttrCase<"OutputLinesNV", 5269>; +def SPV_EM_OutputPrimitivesNV : I32EnumAttrCase<"OutputPrimitivesNV", 5270>; +def SPV_EM_DerivativeGroupQuadsNV : I32EnumAttrCase<"DerivativeGroupQuadsNV", 5289>; +def SPV_EM_DerivativeGroupLinearNV : I32EnumAttrCase<"DerivativeGroupLinearNV", 5290>; +def SPV_EM_OutputTrianglesNV : I32EnumAttrCase<"OutputTrianglesNV", 5298>; +def SPV_EM_PixelInterlockOrderedEXT : I32EnumAttrCase<"PixelInterlockOrderedEXT", 5366>; +def SPV_EM_PixelInterlockUnorderedEXT : I32EnumAttrCase<"PixelInterlockUnorderedEXT", 5367>; +def SPV_EM_SampleInterlockOrderedEXT : I32EnumAttrCase<"SampleInterlockOrderedEXT", 5368>; +def SPV_EM_SampleInterlockUnorderedEXT : I32EnumAttrCase<"SampleInterlockUnorderedEXT", 5369>; +def SPV_EM_ShadingRateInterlockOrderedEXT : I32EnumAttrCase<"ShadingRateInterlockOrderedEXT", 5370>; +def SPV_EM_ShadingRateInterlockUnorderedEXT : I32EnumAttrCase<"ShadingRateInterlockUnorderedEXT", 5371>; + +def SPV_ExecutionModeAttr : + I32EnumAttr<"ExecutionMode", "valid SPIR-V ExecutionMode", [ + SPV_EM_Invocations, SPV_EM_SpacingEqual, SPV_EM_SpacingFractionalEven, + SPV_EM_SpacingFractionalOdd, SPV_EM_VertexOrderCw, SPV_EM_VertexOrderCcw, + SPV_EM_PixelCenterInteger, SPV_EM_OriginUpperLeft, SPV_EM_OriginLowerLeft, + SPV_EM_EarlyFragmentTests, SPV_EM_PointMode, SPV_EM_Xfb, SPV_EM_DepthReplacing, + SPV_EM_DepthGreater, SPV_EM_DepthLess, SPV_EM_DepthUnchanged, SPV_EM_LocalSize, + SPV_EM_LocalSizeHint, SPV_EM_InputPoints, SPV_EM_InputLines, + SPV_EM_InputLinesAdjacency, SPV_EM_Triangles, SPV_EM_InputTrianglesAdjacency, + SPV_EM_Quads, SPV_EM_Isolines, SPV_EM_OutputVertices, SPV_EM_OutputPoints, + SPV_EM_OutputLineStrip, SPV_EM_OutputTriangleStrip, SPV_EM_VecTypeHint, + SPV_EM_ContractionOff, SPV_EM_Initializer, SPV_EM_Finalizer, + SPV_EM_SubgroupSize, SPV_EM_SubgroupsPerWorkgroup, + 
SPV_EM_SubgroupsPerWorkgroupId, SPV_EM_LocalSizeId, SPV_EM_LocalSizeHintId, + SPV_EM_PostDepthCoverage, SPV_EM_DenormPreserve, SPV_EM_DenormFlushToZero, + SPV_EM_SignedZeroInfNanPreserve, SPV_EM_RoundingModeRTE, + SPV_EM_RoundingModeRTZ, SPV_EM_StencilRefReplacingEXT, SPV_EM_OutputLinesNV, + SPV_EM_OutputPrimitivesNV, SPV_EM_DerivativeGroupQuadsNV, + SPV_EM_DerivativeGroupLinearNV, SPV_EM_OutputTrianglesNV, + SPV_EM_PixelInterlockOrderedEXT, SPV_EM_PixelInterlockUnorderedEXT, + SPV_EM_SampleInterlockOrderedEXT, SPV_EM_SampleInterlockUnorderedEXT, + SPV_EM_ShadingRateInterlockOrderedEXT, SPV_EM_ShadingRateInterlockUnorderedEXT + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_EM_Vertex : I32EnumAttrCase<"Vertex", 0>; +def SPV_EM_TessellationControl : I32EnumAttrCase<"TessellationControl", 1>; +def SPV_EM_TessellationEvaluation : I32EnumAttrCase<"TessellationEvaluation", 2>; +def SPV_EM_Geometry : I32EnumAttrCase<"Geometry", 3>; +def SPV_EM_Fragment : I32EnumAttrCase<"Fragment", 4>; +def SPV_EM_GLCompute : I32EnumAttrCase<"GLCompute", 5>; +def SPV_EM_Kernel : I32EnumAttrCase<"Kernel", 6>; +def SPV_EM_TaskNV : I32EnumAttrCase<"TaskNV", 5267>; +def SPV_EM_MeshNV : I32EnumAttrCase<"MeshNV", 5268>; +def SPV_EM_RayGenerationNV : I32EnumAttrCase<"RayGenerationNV", 5313>; +def SPV_EM_IntersectionNV : I32EnumAttrCase<"IntersectionNV", 5314>; +def SPV_EM_AnyHitNV : I32EnumAttrCase<"AnyHitNV", 5315>; +def SPV_EM_ClosestHitNV : I32EnumAttrCase<"ClosestHitNV", 5316>; +def SPV_EM_MissNV : I32EnumAttrCase<"MissNV", 5317>; +def SPV_EM_CallableNV : I32EnumAttrCase<"CallableNV", 5318>; + +def SPV_ExecutionModelAttr : + I32EnumAttr<"ExecutionModel", "valid SPIR-V ExecutionModel", [ + SPV_EM_Vertex, SPV_EM_TessellationControl, SPV_EM_TessellationEvaluation, + SPV_EM_Geometry, SPV_EM_Fragment, SPV_EM_GLCompute, SPV_EM_Kernel, + SPV_EM_TaskNV, SPV_EM_MeshNV, SPV_EM_RayGenerationNV, SPV_EM_IntersectionNV, + SPV_EM_AnyHitNV, SPV_EM_ClosestHitNV, SPV_EM_MissNV, SPV_EM_CallableNV + 
]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_FC_None : BitEnumAttrCase<"None", 0x0000>; +def SPV_FC_Inline : BitEnumAttrCase<"Inline", 0x0001>; +def SPV_FC_DontInline : BitEnumAttrCase<"DontInline", 0x0002>; +def SPV_FC_Pure : BitEnumAttrCase<"Pure", 0x0004>; +def SPV_FC_Const : BitEnumAttrCase<"Const", 0x0008>; + +def SPV_FunctionControlAttr : + BitEnumAttr<"FunctionControl", "valid SPIR-V FunctionControl", [ + SPV_FC_None, SPV_FC_Inline, SPV_FC_DontInline, SPV_FC_Pure, SPV_FC_Const + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_IF_Unknown : I32EnumAttrCase<"Unknown", 0>; +def SPV_IF_Rgba32f : I32EnumAttrCase<"Rgba32f", 1>; +def SPV_IF_Rgba16f : I32EnumAttrCase<"Rgba16f", 2>; +def SPV_IF_R32f : I32EnumAttrCase<"R32f", 3>; +def SPV_IF_Rgba8 : I32EnumAttrCase<"Rgba8", 4>; +def SPV_IF_Rgba8Snorm : I32EnumAttrCase<"Rgba8Snorm", 5>; +def SPV_IF_Rg32f : I32EnumAttrCase<"Rg32f", 6>; +def SPV_IF_Rg16f : I32EnumAttrCase<"Rg16f", 7>; +def SPV_IF_R11fG11fB10f : I32EnumAttrCase<"R11fG11fB10f", 8>; +def SPV_IF_R16f : I32EnumAttrCase<"R16f", 9>; +def SPV_IF_Rgba16 : I32EnumAttrCase<"Rgba16", 10>; +def SPV_IF_Rgb10A2 : I32EnumAttrCase<"Rgb10A2", 11>; +def SPV_IF_Rg16 : I32EnumAttrCase<"Rg16", 12>; +def SPV_IF_Rg8 : I32EnumAttrCase<"Rg8", 13>; +def SPV_IF_R16 : I32EnumAttrCase<"R16", 14>; +def SPV_IF_R8 : I32EnumAttrCase<"R8", 15>; +def SPV_IF_Rgba16Snorm : I32EnumAttrCase<"Rgba16Snorm", 16>; +def SPV_IF_Rg16Snorm : I32EnumAttrCase<"Rg16Snorm", 17>; +def SPV_IF_Rg8Snorm : I32EnumAttrCase<"Rg8Snorm", 18>; +def SPV_IF_R16Snorm : I32EnumAttrCase<"R16Snorm", 19>; +def SPV_IF_R8Snorm : I32EnumAttrCase<"R8Snorm", 20>; +def SPV_IF_Rgba32i : I32EnumAttrCase<"Rgba32i", 21>; +def SPV_IF_Rgba16i : I32EnumAttrCase<"Rgba16i", 22>; +def SPV_IF_Rgba8i : I32EnumAttrCase<"Rgba8i", 23>; +def SPV_IF_R32i : I32EnumAttrCase<"R32i", 24>; +def SPV_IF_Rg32i : I32EnumAttrCase<"Rg32i", 25>; +def SPV_IF_Rg16i : I32EnumAttrCase<"Rg16i", 26>; +def SPV_IF_Rg8i : I32EnumAttrCase<"Rg8i", 
27>; +def SPV_IF_R16i : I32EnumAttrCase<"R16i", 28>; +def SPV_IF_R8i : I32EnumAttrCase<"R8i", 29>; +def SPV_IF_Rgba32ui : I32EnumAttrCase<"Rgba32ui", 30>; +def SPV_IF_Rgba16ui : I32EnumAttrCase<"Rgba16ui", 31>; +def SPV_IF_Rgba8ui : I32EnumAttrCase<"Rgba8ui", 32>; +def SPV_IF_R32ui : I32EnumAttrCase<"R32ui", 33>; +def SPV_IF_Rgb10a2ui : I32EnumAttrCase<"Rgb10a2ui", 34>; +def SPV_IF_Rg32ui : I32EnumAttrCase<"Rg32ui", 35>; +def SPV_IF_Rg16ui : I32EnumAttrCase<"Rg16ui", 36>; +def SPV_IF_Rg8ui : I32EnumAttrCase<"Rg8ui", 37>; +def SPV_IF_R16ui : I32EnumAttrCase<"R16ui", 38>; +def SPV_IF_R8ui : I32EnumAttrCase<"R8ui", 39>; + +def SPV_ImageFormatAttr : + I32EnumAttr<"ImageFormat", "valid SPIR-V ImageFormat", [ + SPV_IF_Unknown, SPV_IF_Rgba32f, SPV_IF_Rgba16f, SPV_IF_R32f, SPV_IF_Rgba8, + SPV_IF_Rgba8Snorm, SPV_IF_Rg32f, SPV_IF_Rg16f, SPV_IF_R11fG11fB10f, + SPV_IF_R16f, SPV_IF_Rgba16, SPV_IF_Rgb10A2, SPV_IF_Rg16, SPV_IF_Rg8, + SPV_IF_R16, SPV_IF_R8, SPV_IF_Rgba16Snorm, SPV_IF_Rg16Snorm, SPV_IF_Rg8Snorm, + SPV_IF_R16Snorm, SPV_IF_R8Snorm, SPV_IF_Rgba32i, SPV_IF_Rgba16i, SPV_IF_Rgba8i, + SPV_IF_R32i, SPV_IF_Rg32i, SPV_IF_Rg16i, SPV_IF_Rg8i, SPV_IF_R16i, SPV_IF_R8i, + SPV_IF_Rgba32ui, SPV_IF_Rgba16ui, SPV_IF_Rgba8ui, SPV_IF_R32ui, + SPV_IF_Rgb10a2ui, SPV_IF_Rg32ui, SPV_IF_Rg16ui, SPV_IF_Rg8ui, SPV_IF_R16ui, + SPV_IF_R8ui + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_LT_Export : I32EnumAttrCase<"Export", 0>; +def SPV_LT_Import : I32EnumAttrCase<"Import", 1>; + +def SPV_LinkageTypeAttr : + I32EnumAttr<"LinkageType", "valid SPIR-V LinkageType", [ + SPV_LT_Export, SPV_LT_Import + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_LC_None : BitEnumAttrCase<"None", 0x0000>; +def SPV_LC_Unroll : BitEnumAttrCase<"Unroll", 0x0001>; +def SPV_LC_DontUnroll : BitEnumAttrCase<"DontUnroll", 0x0002>; +def SPV_LC_DependencyInfinite : BitEnumAttrCase<"DependencyInfinite", 0x0004>; +def SPV_LC_DependencyLength : BitEnumAttrCase<"DependencyLength", 0x0008>; +def 
SPV_LC_MinIterations : BitEnumAttrCase<"MinIterations", 0x0010>; +def SPV_LC_MaxIterations : BitEnumAttrCase<"MaxIterations", 0x0020>; +def SPV_LC_IterationMultiple : BitEnumAttrCase<"IterationMultiple", 0x0040>; +def SPV_LC_PeelCount : BitEnumAttrCase<"PeelCount", 0x0080>; +def SPV_LC_PartialCount : BitEnumAttrCase<"PartialCount", 0x0100>; + +def SPV_LoopControlAttr : + BitEnumAttr<"LoopControl", "valid SPIR-V LoopControl", [ + SPV_LC_None, SPV_LC_Unroll, SPV_LC_DontUnroll, SPV_LC_DependencyInfinite, + SPV_LC_DependencyLength, SPV_LC_MinIterations, SPV_LC_MaxIterations, + SPV_LC_IterationMultiple, SPV_LC_PeelCount, SPV_LC_PartialCount + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_MA_None : BitEnumAttrCase<"None", 0x0000>; +def SPV_MA_Volatile : BitEnumAttrCase<"Volatile", 0x0001>; +def SPV_MA_Aligned : BitEnumAttrCase<"Aligned", 0x0002>; +def SPV_MA_Nontemporal : BitEnumAttrCase<"Nontemporal", 0x0004>; +def SPV_MA_MakePointerAvailable : BitEnumAttrCase<"MakePointerAvailable", 0x0008>; +def SPV_MA_MakePointerVisible : BitEnumAttrCase<"MakePointerVisible", 0x0010>; +def SPV_MA_NonPrivatePointer : BitEnumAttrCase<"NonPrivatePointer", 0x0020>; + +def SPV_MemoryAccessAttr : + BitEnumAttr<"MemoryAccess", "valid SPIR-V MemoryAccess", [ + SPV_MA_None, SPV_MA_Volatile, SPV_MA_Aligned, SPV_MA_Nontemporal, + SPV_MA_MakePointerAvailable, SPV_MA_MakePointerVisible, + SPV_MA_NonPrivatePointer + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_MM_Simple : I32EnumAttrCase<"Simple", 0>; +def SPV_MM_GLSL450 : I32EnumAttrCase<"GLSL450", 1>; +def SPV_MM_OpenCL : I32EnumAttrCase<"OpenCL", 2>; +def SPV_MM_Vulkan : I32EnumAttrCase<"Vulkan", 3>; + +def SPV_MemoryModelAttr : + I32EnumAttr<"MemoryModel", "valid SPIR-V MemoryModel", [ + SPV_MM_Simple, SPV_MM_GLSL450, SPV_MM_OpenCL, SPV_MM_Vulkan + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_MS_None : BitEnumAttrCase<"None", 0x0000>; +def SPV_MS_Acquire : BitEnumAttrCase<"Acquire", 0x0002>; +def 
SPV_MS_Release : BitEnumAttrCase<"Release", 0x0004>; +def SPV_MS_AcquireRelease : BitEnumAttrCase<"AcquireRelease", 0x0008>; +def SPV_MS_SequentiallyConsistent : BitEnumAttrCase<"SequentiallyConsistent", 0x0010>; +def SPV_MS_UniformMemory : BitEnumAttrCase<"UniformMemory", 0x0040>; +def SPV_MS_SubgroupMemory : BitEnumAttrCase<"SubgroupMemory", 0x0080>; +def SPV_MS_WorkgroupMemory : BitEnumAttrCase<"WorkgroupMemory", 0x0100>; +def SPV_MS_CrossWorkgroupMemory : BitEnumAttrCase<"CrossWorkgroupMemory", 0x0200>; +def SPV_MS_AtomicCounterMemory : BitEnumAttrCase<"AtomicCounterMemory", 0x0400>; +def SPV_MS_ImageMemory : BitEnumAttrCase<"ImageMemory", 0x0800>; +def SPV_MS_OutputMemory : BitEnumAttrCase<"OutputMemory", 0x1000>; +def SPV_MS_MakeAvailable : BitEnumAttrCase<"MakeAvailable", 0x2000>; +def SPV_MS_MakeVisible : BitEnumAttrCase<"MakeVisible", 0x4000>; +def SPV_MS_Volatile : BitEnumAttrCase<"Volatile", 0x8000>; + +def SPV_MemorySemanticsAttr : + BitEnumAttr<"MemorySemantics", "valid SPIR-V MemorySemantics", [ + SPV_MS_None, SPV_MS_Acquire, SPV_MS_Release, SPV_MS_AcquireRelease, + SPV_MS_SequentiallyConsistent, SPV_MS_UniformMemory, SPV_MS_SubgroupMemory, + SPV_MS_WorkgroupMemory, SPV_MS_CrossWorkgroupMemory, + SPV_MS_AtomicCounterMemory, SPV_MS_ImageMemory, SPV_MS_OutputMemory, + SPV_MS_MakeAvailable, SPV_MS_MakeVisible, SPV_MS_Volatile + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_S_CrossDevice : I32EnumAttrCase<"CrossDevice", 0>; +def SPV_S_Device : I32EnumAttrCase<"Device", 1>; +def SPV_S_Workgroup : I32EnumAttrCase<"Workgroup", 2>; +def SPV_S_Subgroup : I32EnumAttrCase<"Subgroup", 3>; +def SPV_S_Invocation : I32EnumAttrCase<"Invocation", 4>; +def SPV_S_QueueFamily : I32EnumAttrCase<"QueueFamily", 5>; + +def SPV_ScopeAttr : + I32EnumAttr<"Scope", "valid SPIR-V Scope", [ + SPV_S_CrossDevice, SPV_S_Device, SPV_S_Workgroup, SPV_S_Subgroup, + SPV_S_Invocation, SPV_S_QueueFamily + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_SC_None : 
BitEnumAttrCase<"None", 0x0000>; +def SPV_SC_Flatten : BitEnumAttrCase<"Flatten", 0x0001>; +def SPV_SC_DontFlatten : BitEnumAttrCase<"DontFlatten", 0x0002>; + +def SPV_SelectionControlAttr : + BitEnumAttr<"SelectionControl", "valid SPIR-V SelectionControl", [ + SPV_SC_None, SPV_SC_Flatten, SPV_SC_DontFlatten + ]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_SC_UniformConstant : I32EnumAttrCase<"UniformConstant", 0>; +def SPV_SC_Input : I32EnumAttrCase<"Input", 1>; +def SPV_SC_Uniform : I32EnumAttrCase<"Uniform", 2>; +def SPV_SC_Output : I32EnumAttrCase<"Output", 3>; +def SPV_SC_Workgroup : I32EnumAttrCase<"Workgroup", 4>; +def SPV_SC_CrossWorkgroup : I32EnumAttrCase<"CrossWorkgroup", 5>; +def SPV_SC_Private : I32EnumAttrCase<"Private", 6>; +def SPV_SC_Function : I32EnumAttrCase<"Function", 7>; +def SPV_SC_Generic : I32EnumAttrCase<"Generic", 8>; +def SPV_SC_PushConstant : I32EnumAttrCase<"PushConstant", 9>; +def SPV_SC_AtomicCounter : I32EnumAttrCase<"AtomicCounter", 10>; +def SPV_SC_Image : I32EnumAttrCase<"Image", 11>; +def SPV_SC_StorageBuffer : I32EnumAttrCase<"StorageBuffer", 12>; +def SPV_SC_CallableDataNV : I32EnumAttrCase<"CallableDataNV", 5328>; +def SPV_SC_IncomingCallableDataNV : I32EnumAttrCase<"IncomingCallableDataNV", 5329>; +def SPV_SC_RayPayloadNV : I32EnumAttrCase<"RayPayloadNV", 5338>; +def SPV_SC_HitAttributeNV : I32EnumAttrCase<"HitAttributeNV", 5339>; +def SPV_SC_IncomingRayPayloadNV : I32EnumAttrCase<"IncomingRayPayloadNV", 5342>; +def SPV_SC_ShaderRecordBufferNV : I32EnumAttrCase<"ShaderRecordBufferNV", 5343>; +def SPV_SC_PhysicalStorageBuffer : I32EnumAttrCase<"PhysicalStorageBuffer", 5349>; + +def SPV_StorageClassAttr : + I32EnumAttr<"StorageClass", "valid SPIR-V StorageClass", [ + SPV_SC_UniformConstant, SPV_SC_Input, SPV_SC_Uniform, SPV_SC_Output, + SPV_SC_Workgroup, SPV_SC_CrossWorkgroup, SPV_SC_Private, SPV_SC_Function, + SPV_SC_Generic, SPV_SC_PushConstant, SPV_SC_AtomicCounter, SPV_SC_Image, + SPV_SC_StorageBuffer, 
SPV_SC_CallableDataNV, SPV_SC_IncomingCallableDataNV, + SPV_SC_RayPayloadNV, SPV_SC_HitAttributeNV, SPV_SC_IncomingRayPayloadNV, + SPV_SC_ShaderRecordBufferNV, SPV_SC_PhysicalStorageBuffer + ]> { + let cppNamespace = "::mlir::spirv"; +} + +// End enum section. Generated from SPIR-V spec; DO NOT MODIFY! + +// Enums added manually that are not part of SPIR-V spec + +def SPV_IDI_NoDepth : I32EnumAttrCase<"NoDepth", 0>; +def SPV_IDI_IsDepth : I32EnumAttrCase<"IsDepth", 1>; +def SPV_IDI_DepthUnknown : I32EnumAttrCase<"DepthUnknown", 2>; + +def SPV_DepthAttr : + I32EnumAttr<"ImageDepthInfo", "valid SPIR-V Image Depth specification", + [SPV_IDI_NoDepth, SPV_IDI_IsDepth, SPV_IDI_DepthUnknown]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_IAI_NonArrayed : I32EnumAttrCase<"NonArrayed", 0>; +def SPV_IAI_Arrayed : I32EnumAttrCase<"Arrayed", 1>; + +def SPV_ArrayedAttr : + I32EnumAttr<"ImageArrayedInfo", "valid SPIR-V Image Arrayed specification", + [SPV_IAI_NonArrayed, SPV_IAI_Arrayed]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_ISI_SingleSampled : I32EnumAttrCase<"SingleSampled", 0>; +def SPV_ISI_MultiSampled : I32EnumAttrCase<"MultiSampled", 1>; + +def SPV_SamplingAttr: + I32EnumAttr<"ImageSamplingInfo", "valid SPIR-V Image Sampling specification", + [SPV_ISI_SingleSampled, SPV_ISI_MultiSampled]> { + let cppNamespace = "::mlir::spirv"; +} + +def SPV_ISUI_SamplerUnknown : I32EnumAttrCase<"SamplerUnknown", 0>; +def SPV_ISUI_NeedSampler : I32EnumAttrCase<"NeedSampler", 1>; +def SPV_ISUI_NoSampler : I32EnumAttrCase<"NoSampler", 2>; + +def SPV_SamplerUseAttr: + I32EnumAttr<"ImageSamplerUseInfo", "valid SPIR-V Sampler Use specification", + [SPV_ISUI_SamplerUnknown, SPV_ISUI_NeedSampler, SPV_ISUI_NoSampler]> { + let cppNamespace = "::mlir::spirv"; +} + +//===----------------------------------------------------------------------===// +// SPIR-V type definitions +//===----------------------------------------------------------------------===// + +def SPV_IsPtrType : 
CPred<"$_self.isa<::mlir::spirv::PointerType>()">; +def SPV_IsArrayType : CPred<"$_self.isa<::mlir::spirv::ArrayType>()">; +def SPV_IsRTArrayType : CPred<"$_self.isa<::mlir::spirv::RuntimeArrayType>()">; +def SPV_IsStructType : CPred<"$_self.isa<::mlir::spirv::StructType>()">; + +// See https://www.khronos.org/registry/spir-v/specs/unified1/SPIRV.html#_types +// for the definition of the following types and type categories. + +def SPV_Void : TypeAlias; +def SPV_Bool : IntOfWidths<[1]>; +def SPV_Integer : IntOfWidths<[8, 16, 32, 64]>; +def SPV_Float : FloatOfWidths<[16, 32, 64]>; +def SPV_Float16or32 : FloatOfWidths<[16, 32]>; +def SPV_Vector : VectorOfLengthAndType<[2, 3, 4], + [SPV_Bool, SPV_Integer, SPV_Float]>; +// Component type check is done in the type parser for the following SPIR-V +// dialect-specific types so we use "Any" here. +def SPV_AnyPtr : Type; +def SPV_AnyArray : Type; +def SPV_AnyRTArray : Type; +def SPV_AnyStruct : Type; + +def SPV_Numerical : AnyTypeOf<[SPV_Integer, SPV_Float]>; +def SPV_Scalar : AnyTypeOf<[SPV_Numerical, SPV_Bool]>; +def SPV_Aggregate : AnyTypeOf<[SPV_AnyArray, SPV_AnyStruct]>; +def SPV_Composite : + AnyTypeOf<[SPV_Vector, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct]>; +def SPV_Type : AnyTypeOf<[ + SPV_Void, SPV_Bool, SPV_Integer, SPV_Float, SPV_Vector, + SPV_AnyPtr, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct + ]>; + +class SPV_ScalarOrVectorOf : + AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4], [type]>]>; + +def SPV_ScalarOrVector : AnyTypeOf<[SPV_Scalar, SPV_Vector]>; +def SPV_ScalarOrVectorOrPtr : AnyTypeOf<[SPV_ScalarOrVector, SPV_AnyPtr]>; + +class SPV_Vec4 : VectorOfLengthAndType<[4], [type]>; +def SPV_IntVec4 : SPV_Vec4; +def SPV_I32Vec4 : SPV_Vec4; + +// TODO(antiagainst): Use a more appropriate way to model optional operands +class SPV_Optional : Variadic; + +// TODO(ravishankarm): From 1.4, this should also include Composite type. 
+def SPV_SelectType : AnyTypeOf<[SPV_Scalar, SPV_Vector, SPV_AnyPtr]>; + +//===----------------------------------------------------------------------===// +// SPIR-V OpTrait definitions +//===----------------------------------------------------------------------===// + +// Check that an op can only be used within the scope of a FuncOp. +def InFunctionScope : PredOpTrait< + "op must appear in a 'func' block", + CPred<"($_op.getParentOfType())">>; + +// Check that an op can only be used within the scope of a SPIR-V ModuleOp. +def InModuleScope : PredOpTrait< + "op must appear in a 'spv.module' block", + CPred<"llvm::isa_and_nonnull($_op.getParentOp())">>; + +//===----------------------------------------------------------------------===// +// SPIR-V opcode specification +//===----------------------------------------------------------------------===// + +class SPV_OpCode { + // Name used as reference to retrieve the opcode + string opname = name; + + // Opcode associated with the name + int opcode = val; +} + +// Begin opcode section. Generated from SPIR-V spec; DO NOT MODIFY! 
+ +def SPV_OC_OpNop : I32EnumAttrCase<"OpNop", 0>; +def SPV_OC_OpUndef : I32EnumAttrCase<"OpUndef", 1>; +def SPV_OC_OpSourceContinued : I32EnumAttrCase<"OpSourceContinued", 2>; +def SPV_OC_OpSource : I32EnumAttrCase<"OpSource", 3>; +def SPV_OC_OpSourceExtension : I32EnumAttrCase<"OpSourceExtension", 4>; +def SPV_OC_OpName : I32EnumAttrCase<"OpName", 5>; +def SPV_OC_OpMemberName : I32EnumAttrCase<"OpMemberName", 6>; +def SPV_OC_OpString : I32EnumAttrCase<"OpString", 7>; +def SPV_OC_OpExtension : I32EnumAttrCase<"OpExtension", 10>; +def SPV_OC_OpExtInstImport : I32EnumAttrCase<"OpExtInstImport", 11>; +def SPV_OC_OpExtInst : I32EnumAttrCase<"OpExtInst", 12>; +def SPV_OC_OpMemoryModel : I32EnumAttrCase<"OpMemoryModel", 14>; +def SPV_OC_OpEntryPoint : I32EnumAttrCase<"OpEntryPoint", 15>; +def SPV_OC_OpExecutionMode : I32EnumAttrCase<"OpExecutionMode", 16>; +def SPV_OC_OpCapability : I32EnumAttrCase<"OpCapability", 17>; +def SPV_OC_OpTypeVoid : I32EnumAttrCase<"OpTypeVoid", 19>; +def SPV_OC_OpTypeBool : I32EnumAttrCase<"OpTypeBool", 20>; +def SPV_OC_OpTypeInt : I32EnumAttrCase<"OpTypeInt", 21>; +def SPV_OC_OpTypeFloat : I32EnumAttrCase<"OpTypeFloat", 22>; +def SPV_OC_OpTypeVector : I32EnumAttrCase<"OpTypeVector", 23>; +def SPV_OC_OpTypeArray : I32EnumAttrCase<"OpTypeArray", 28>; +def SPV_OC_OpTypeRuntimeArray : I32EnumAttrCase<"OpTypeRuntimeArray", 29>; +def SPV_OC_OpTypeStruct : I32EnumAttrCase<"OpTypeStruct", 30>; +def SPV_OC_OpTypePointer : I32EnumAttrCase<"OpTypePointer", 32>; +def SPV_OC_OpTypeFunction : I32EnumAttrCase<"OpTypeFunction", 33>; +def SPV_OC_OpConstantTrue : I32EnumAttrCase<"OpConstantTrue", 41>; +def SPV_OC_OpConstantFalse : I32EnumAttrCase<"OpConstantFalse", 42>; +def SPV_OC_OpConstant : I32EnumAttrCase<"OpConstant", 43>; +def SPV_OC_OpConstantComposite : I32EnumAttrCase<"OpConstantComposite", 44>; +def SPV_OC_OpConstantNull : I32EnumAttrCase<"OpConstantNull", 46>; +def SPV_OC_OpSpecConstantTrue : I32EnumAttrCase<"OpSpecConstantTrue", 48>; +def 
SPV_OC_OpSpecConstantFalse : I32EnumAttrCase<"OpSpecConstantFalse", 49>; +def SPV_OC_OpSpecConstant : I32EnumAttrCase<"OpSpecConstant", 50>; +def SPV_OC_OpSpecConstantComposite : I32EnumAttrCase<"OpSpecConstantComposite", 51>; +def SPV_OC_OpFunction : I32EnumAttrCase<"OpFunction", 54>; +def SPV_OC_OpFunctionParameter : I32EnumAttrCase<"OpFunctionParameter", 55>; +def SPV_OC_OpFunctionEnd : I32EnumAttrCase<"OpFunctionEnd", 56>; +def SPV_OC_OpFunctionCall : I32EnumAttrCase<"OpFunctionCall", 57>; +def SPV_OC_OpVariable : I32EnumAttrCase<"OpVariable", 59>; +def SPV_OC_OpLoad : I32EnumAttrCase<"OpLoad", 61>; +def SPV_OC_OpStore : I32EnumAttrCase<"OpStore", 62>; +def SPV_OC_OpAccessChain : I32EnumAttrCase<"OpAccessChain", 65>; +def SPV_OC_OpDecorate : I32EnumAttrCase<"OpDecorate", 71>; +def SPV_OC_OpMemberDecorate : I32EnumAttrCase<"OpMemberDecorate", 72>; +def SPV_OC_OpCompositeConstruct : I32EnumAttrCase<"OpCompositeConstruct", 80>; +def SPV_OC_OpCompositeExtract : I32EnumAttrCase<"OpCompositeExtract", 81>; +def SPV_OC_OpCompositeInsert : I32EnumAttrCase<"OpCompositeInsert", 82>; +def SPV_OC_OpConvertFToU : I32EnumAttrCase<"OpConvertFToU", 109>; +def SPV_OC_OpConvertFToS : I32EnumAttrCase<"OpConvertFToS", 110>; +def SPV_OC_OpConvertSToF : I32EnumAttrCase<"OpConvertSToF", 111>; +def SPV_OC_OpConvertUToF : I32EnumAttrCase<"OpConvertUToF", 112>; +def SPV_OC_OpUConvert : I32EnumAttrCase<"OpUConvert", 113>; +def SPV_OC_OpSConvert : I32EnumAttrCase<"OpSConvert", 114>; +def SPV_OC_OpFConvert : I32EnumAttrCase<"OpFConvert", 115>; +def SPV_OC_OpBitcast : I32EnumAttrCase<"OpBitcast", 124>; +def SPV_OC_OpFNegate : I32EnumAttrCase<"OpFNegate", 127>; +def SPV_OC_OpIAdd : I32EnumAttrCase<"OpIAdd", 128>; +def SPV_OC_OpFAdd : I32EnumAttrCase<"OpFAdd", 129>; +def SPV_OC_OpISub : I32EnumAttrCase<"OpISub", 130>; +def SPV_OC_OpFSub : I32EnumAttrCase<"OpFSub", 131>; +def SPV_OC_OpIMul : I32EnumAttrCase<"OpIMul", 132>; +def SPV_OC_OpFMul : I32EnumAttrCase<"OpFMul", 133>; +def SPV_OC_OpUDiv 
: I32EnumAttrCase<"OpUDiv", 134>; +def SPV_OC_OpSDiv : I32EnumAttrCase<"OpSDiv", 135>; +def SPV_OC_OpFDiv : I32EnumAttrCase<"OpFDiv", 136>; +def SPV_OC_OpUMod : I32EnumAttrCase<"OpUMod", 137>; +def SPV_OC_OpSRem : I32EnumAttrCase<"OpSRem", 138>; +def SPV_OC_OpSMod : I32EnumAttrCase<"OpSMod", 139>; +def SPV_OC_OpFRem : I32EnumAttrCase<"OpFRem", 140>; +def SPV_OC_OpFMod : I32EnumAttrCase<"OpFMod", 141>; +def SPV_OC_OpLogicalEqual : I32EnumAttrCase<"OpLogicalEqual", 164>; +def SPV_OC_OpLogicalNotEqual : I32EnumAttrCase<"OpLogicalNotEqual", 165>; +def SPV_OC_OpLogicalOr : I32EnumAttrCase<"OpLogicalOr", 166>; +def SPV_OC_OpLogicalAnd : I32EnumAttrCase<"OpLogicalAnd", 167>; +def SPV_OC_OpLogicalNot : I32EnumAttrCase<"OpLogicalNot", 168>; +def SPV_OC_OpSelect : I32EnumAttrCase<"OpSelect", 169>; +def SPV_OC_OpIEqual : I32EnumAttrCase<"OpIEqual", 170>; +def SPV_OC_OpINotEqual : I32EnumAttrCase<"OpINotEqual", 171>; +def SPV_OC_OpUGreaterThan : I32EnumAttrCase<"OpUGreaterThan", 172>; +def SPV_OC_OpSGreaterThan : I32EnumAttrCase<"OpSGreaterThan", 173>; +def SPV_OC_OpUGreaterThanEqual : I32EnumAttrCase<"OpUGreaterThanEqual", 174>; +def SPV_OC_OpSGreaterThanEqual : I32EnumAttrCase<"OpSGreaterThanEqual", 175>; +def SPV_OC_OpULessThan : I32EnumAttrCase<"OpULessThan", 176>; +def SPV_OC_OpSLessThan : I32EnumAttrCase<"OpSLessThan", 177>; +def SPV_OC_OpULessThanEqual : I32EnumAttrCase<"OpULessThanEqual", 178>; +def SPV_OC_OpSLessThanEqual : I32EnumAttrCase<"OpSLessThanEqual", 179>; +def SPV_OC_OpFOrdEqual : I32EnumAttrCase<"OpFOrdEqual", 180>; +def SPV_OC_OpFUnordEqual : I32EnumAttrCase<"OpFUnordEqual", 181>; +def SPV_OC_OpFOrdNotEqual : I32EnumAttrCase<"OpFOrdNotEqual", 182>; +def SPV_OC_OpFUnordNotEqual : I32EnumAttrCase<"OpFUnordNotEqual", 183>; +def SPV_OC_OpFOrdLessThan : I32EnumAttrCase<"OpFOrdLessThan", 184>; +def SPV_OC_OpFUnordLessThan : I32EnumAttrCase<"OpFUnordLessThan", 185>; +def SPV_OC_OpFOrdGreaterThan : I32EnumAttrCase<"OpFOrdGreaterThan", 186>; +def 
SPV_OC_OpFUnordGreaterThan : I32EnumAttrCase<"OpFUnordGreaterThan", 187>; +def SPV_OC_OpFOrdLessThanEqual : I32EnumAttrCase<"OpFOrdLessThanEqual", 188>; +def SPV_OC_OpFUnordLessThanEqual : I32EnumAttrCase<"OpFUnordLessThanEqual", 189>; +def SPV_OC_OpFOrdGreaterThanEqual : I32EnumAttrCase<"OpFOrdGreaterThanEqual", 190>; +def SPV_OC_OpFUnordGreaterThanEqual : I32EnumAttrCase<"OpFUnordGreaterThanEqual", 191>; +def SPV_OC_OpShiftRightLogical : I32EnumAttrCase<"OpShiftRightLogical", 194>; +def SPV_OC_OpShiftRightArithmetic : I32EnumAttrCase<"OpShiftRightArithmetic", 195>; +def SPV_OC_OpShiftLeftLogical : I32EnumAttrCase<"OpShiftLeftLogical", 196>; +def SPV_OC_OpBitwiseOr : I32EnumAttrCase<"OpBitwiseOr", 197>; +def SPV_OC_OpBitwiseXor : I32EnumAttrCase<"OpBitwiseXor", 198>; +def SPV_OC_OpBitwiseAnd : I32EnumAttrCase<"OpBitwiseAnd", 199>; +def SPV_OC_OpNot : I32EnumAttrCase<"OpNot", 200>; +def SPV_OC_OpBitFieldInsert : I32EnumAttrCase<"OpBitFieldInsert", 201>; +def SPV_OC_OpBitFieldSExtract : I32EnumAttrCase<"OpBitFieldSExtract", 202>; +def SPV_OC_OpBitFieldUExtract : I32EnumAttrCase<"OpBitFieldUExtract", 203>; +def SPV_OC_OpBitReverse : I32EnumAttrCase<"OpBitReverse", 204>; +def SPV_OC_OpBitCount : I32EnumAttrCase<"OpBitCount", 205>; +def SPV_OC_OpControlBarrier : I32EnumAttrCase<"OpControlBarrier", 224>; +def SPV_OC_OpMemoryBarrier : I32EnumAttrCase<"OpMemoryBarrier", 225>; +def SPV_OC_OpAtomicCompareExchangeWeak : I32EnumAttrCase<"OpAtomicCompareExchangeWeak", 231>; +def SPV_OC_OpAtomicIIncrement : I32EnumAttrCase<"OpAtomicIIncrement", 232>; +def SPV_OC_OpAtomicIDecrement : I32EnumAttrCase<"OpAtomicIDecrement", 233>; +def SPV_OC_OpAtomicIAdd : I32EnumAttrCase<"OpAtomicIAdd", 234>; +def SPV_OC_OpAtomicISub : I32EnumAttrCase<"OpAtomicISub", 235>; +def SPV_OC_OpAtomicSMin : I32EnumAttrCase<"OpAtomicSMin", 236>; +def SPV_OC_OpAtomicUMin : I32EnumAttrCase<"OpAtomicUMin", 237>; +def SPV_OC_OpAtomicSMax : I32EnumAttrCase<"OpAtomicSMax", 238>; +def SPV_OC_OpAtomicUMax : 
I32EnumAttrCase<"OpAtomicUMax", 239>; +def SPV_OC_OpAtomicAnd : I32EnumAttrCase<"OpAtomicAnd", 240>; +def SPV_OC_OpAtomicOr : I32EnumAttrCase<"OpAtomicOr", 241>; +def SPV_OC_OpAtomicXor : I32EnumAttrCase<"OpAtomicXor", 242>; +def SPV_OC_OpPhi : I32EnumAttrCase<"OpPhi", 245>; +def SPV_OC_OpLoopMerge : I32EnumAttrCase<"OpLoopMerge", 246>; +def SPV_OC_OpSelectionMerge : I32EnumAttrCase<"OpSelectionMerge", 247>; +def SPV_OC_OpLabel : I32EnumAttrCase<"OpLabel", 248>; +def SPV_OC_OpBranch : I32EnumAttrCase<"OpBranch", 249>; +def SPV_OC_OpBranchConditional : I32EnumAttrCase<"OpBranchConditional", 250>; +def SPV_OC_OpReturn : I32EnumAttrCase<"OpReturn", 253>; +def SPV_OC_OpReturnValue : I32EnumAttrCase<"OpReturnValue", 254>; +def SPV_OC_OpUnreachable : I32EnumAttrCase<"OpUnreachable", 255>; +def SPV_OC_OpModuleProcessed : I32EnumAttrCase<"OpModuleProcessed", 330>; +def SPV_OC_OpGroupNonUniformBallot : I32EnumAttrCase<"OpGroupNonUniformBallot", 339>; +def SPV_OC_OpSubgroupBallotKHR : I32EnumAttrCase<"OpSubgroupBallotKHR", 4421>; + +def SPV_OpcodeAttr : + I32EnumAttr<"Opcode", "valid SPIR-V instructions", [ + SPV_OC_OpNop, SPV_OC_OpUndef, SPV_OC_OpSourceContinued, SPV_OC_OpSource, + SPV_OC_OpSourceExtension, SPV_OC_OpName, SPV_OC_OpMemberName, SPV_OC_OpString, + SPV_OC_OpExtension, SPV_OC_OpExtInstImport, SPV_OC_OpExtInst, + SPV_OC_OpMemoryModel, SPV_OC_OpEntryPoint, SPV_OC_OpExecutionMode, + SPV_OC_OpCapability, SPV_OC_OpTypeVoid, SPV_OC_OpTypeBool, SPV_OC_OpTypeInt, + SPV_OC_OpTypeFloat, SPV_OC_OpTypeVector, SPV_OC_OpTypeArray, + SPV_OC_OpTypeRuntimeArray, SPV_OC_OpTypeStruct, SPV_OC_OpTypePointer, + SPV_OC_OpTypeFunction, SPV_OC_OpConstantTrue, SPV_OC_OpConstantFalse, + SPV_OC_OpConstant, SPV_OC_OpConstantComposite, SPV_OC_OpConstantNull, + SPV_OC_OpSpecConstantTrue, SPV_OC_OpSpecConstantFalse, SPV_OC_OpSpecConstant, + SPV_OC_OpSpecConstantComposite, SPV_OC_OpFunction, SPV_OC_OpFunctionParameter, + SPV_OC_OpFunctionEnd, SPV_OC_OpFunctionCall, SPV_OC_OpVariable, 
SPV_OC_OpLoad, + SPV_OC_OpStore, SPV_OC_OpAccessChain, SPV_OC_OpDecorate, + SPV_OC_OpMemberDecorate, SPV_OC_OpCompositeConstruct, + SPV_OC_OpCompositeExtract, SPV_OC_OpCompositeInsert, SPV_OC_OpConvertFToU, + SPV_OC_OpConvertFToS, SPV_OC_OpConvertSToF, SPV_OC_OpConvertUToF, + SPV_OC_OpUConvert, SPV_OC_OpSConvert, SPV_OC_OpFConvert, SPV_OC_OpBitcast, + SPV_OC_OpFNegate, SPV_OC_OpIAdd, SPV_OC_OpFAdd, SPV_OC_OpISub, SPV_OC_OpFSub, + SPV_OC_OpIMul, SPV_OC_OpFMul, SPV_OC_OpUDiv, SPV_OC_OpSDiv, SPV_OC_OpFDiv, + SPV_OC_OpUMod, SPV_OC_OpSRem, SPV_OC_OpSMod, SPV_OC_OpFRem, SPV_OC_OpFMod, + SPV_OC_OpLogicalEqual, SPV_OC_OpLogicalNotEqual, SPV_OC_OpLogicalOr, + SPV_OC_OpLogicalAnd, SPV_OC_OpLogicalNot, SPV_OC_OpSelect, SPV_OC_OpIEqual, + SPV_OC_OpINotEqual, SPV_OC_OpUGreaterThan, SPV_OC_OpSGreaterThan, + SPV_OC_OpUGreaterThanEqual, SPV_OC_OpSGreaterThanEqual, SPV_OC_OpULessThan, + SPV_OC_OpSLessThan, SPV_OC_OpULessThanEqual, SPV_OC_OpSLessThanEqual, + SPV_OC_OpFOrdEqual, SPV_OC_OpFUnordEqual, SPV_OC_OpFOrdNotEqual, + SPV_OC_OpFUnordNotEqual, SPV_OC_OpFOrdLessThan, SPV_OC_OpFUnordLessThan, + SPV_OC_OpFOrdGreaterThan, SPV_OC_OpFUnordGreaterThan, + SPV_OC_OpFOrdLessThanEqual, SPV_OC_OpFUnordLessThanEqual, + SPV_OC_OpFOrdGreaterThanEqual, SPV_OC_OpFUnordGreaterThanEqual, + SPV_OC_OpShiftRightLogical, SPV_OC_OpShiftRightArithmetic, + SPV_OC_OpShiftLeftLogical, SPV_OC_OpBitwiseOr, SPV_OC_OpBitwiseXor, + SPV_OC_OpBitwiseAnd, SPV_OC_OpNot, SPV_OC_OpBitFieldInsert, + SPV_OC_OpBitFieldSExtract, SPV_OC_OpBitFieldUExtract, SPV_OC_OpBitReverse, + SPV_OC_OpBitCount, SPV_OC_OpControlBarrier, SPV_OC_OpMemoryBarrier, + SPV_OC_OpAtomicCompareExchangeWeak, SPV_OC_OpAtomicIIncrement, + SPV_OC_OpAtomicIDecrement, SPV_OC_OpAtomicIAdd, SPV_OC_OpAtomicISub, + SPV_OC_OpAtomicSMin, SPV_OC_OpAtomicUMin, SPV_OC_OpAtomicSMax, + SPV_OC_OpAtomicUMax, SPV_OC_OpAtomicAnd, SPV_OC_OpAtomicOr, SPV_OC_OpAtomicXor, + SPV_OC_OpPhi, SPV_OC_OpLoopMerge, SPV_OC_OpSelectionMerge, SPV_OC_OpLabel, + SPV_OC_OpBranch, 
SPV_OC_OpBranchConditional, SPV_OC_OpReturn, + SPV_OC_OpReturnValue, SPV_OC_OpUnreachable, SPV_OC_OpModuleProcessed, + SPV_OC_OpGroupNonUniformBallot, SPV_OC_OpSubgroupBallotKHR + ]> { + let cppNamespace = "::mlir::spirv"; +} + +// End opcode section. Generated from SPIR-V spec; DO NOT MODIFY! + +//===----------------------------------------------------------------------===// +// SPIR-V op definitions +//===----------------------------------------------------------------------===// + +// Base class for all SPIR-V ops. +class SPV_Op traits = []> : + Op { + + // For each SPIR-V op, the following static functions need to be defined + // in SPVOps.cpp: + // + // * static ParseResult parse(OpAsmParser &parser, + // OperationState &result) + // * static void print(OpAsmPrinter &p, op) + // * static LogicalResult verify( op) + let parser = [{ return ::parse$cppClass(parser, result); }]; + let printer = [{ return ::print(*this, p); }]; + let verifier = [{ return ::verify(*this); }]; + + // Specifies whether this op has a direct corresponding SPIR-V binary + // instruction opcode. The (de)serializer use this field to determine whether + // to auto-generate an entry in the (de)serialization dispatch table for this + // op. + bit hasOpcode = 1; + + // Name of the corresponding SPIR-V op. Only valid to use when hasOpcode is 1. + string spirvOpName = "Op" # mnemonic; + + // Controls whether to auto-generate this op's (de)serialization method. + // If set, it results in generation of the following methods: + // + // ```c++ + // template Serializer::processOp(OpTy op); + // template Deserializer::processOp(ArrayRef); + // ``` + // + // If this field is not set, then manual implementation of a specialization of + // these methods is required. + // + // Note: + // 1) If hasOpcode is set but autogenSerialization is not set, the + // (de)serializer dispatch method still calls the above method for + // (de)serializing this op. 
+ // 2) If hasOpcode is not set, but autogenSerialization is set, the + // above methods for (de)serialization are generated, but there is no + // entry added in the dispatch tables to invoke these methods. The + // dispatch needs to be handled manually. SPV_ExtInstOps are an + // example of this. + bit autogenSerialization = 1; +} + +class SPV_UnaryOp traits = []> : + SPV_Op { + let arguments = (ins + SPV_ScalarOrVectorOf:$operand + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return ::parseUnaryOp(parser, result); }]; + let printer = [{ return ::printUnaryOp(getOperation(), p); }]; + // No additional verification needed in addition to the ODS-generated ones. + let verifier = [{ return success(); }]; +} + +class SPV_BinaryOp traits = []> : + SPV_Op { + let arguments = (ins + SPV_ScalarOrVectorOf:$operand1, + SPV_ScalarOrVectorOf:$operand2 + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return impl::parseOneResultSameOperandTypeOp(parser, result); }]; + let printer = [{ return impl::printOneResultOp(getOperation(), p); }]; + // No additional verification needed in addition to the ODS-generated ones. + let verifier = [{ return success(); }]; +} + +class SPV_ExtInstOp traits = []> : + SPV_Op { + + // Extended instruction sets have no direct opcode (they share the + // same `OpExtInst` instruction). So the hasOpcode field is set to + // false. So no entry corresponding to these ops are added in the + // dispatch functions for (de)serialization. The methods for + // (de)serialization are still automatically generated (since + // autogenSerialization remains 1). A separate method is generated + // for dispatching extended instruction set ops. + let hasOpcode = 0; + + // Opcode within extended instruction set. + int extendedInstOpcode = opcode; + + // Name used to import the extended instruction set. 
+ string extendedInstSetName = setName; +} + +#endif // SPIRV_BASE diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBinaryUtils.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVBinaryUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..6a4264884238ad9fc2a16cdb0ff229a1d6f2b40e --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBinaryUtils.h @@ -0,0 +1,49 @@ +//===- SPIRVBinaryUtils.cpp - SPIR-V Binary Module Utils --------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares common utilities for SPIR-V binary module. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SPIRV_BINARY_UTILS_H_ +#define MLIR_DIALECT_SPIRV_SPIRV_BINARY_UTILS_H_ + +#include "mlir/Dialect/SPIRV/SPIRVOps.h" +#include "mlir/Support/LogicalResult.h" + +#include + +namespace mlir { +namespace spirv { + +/// SPIR-V binary header word count +constexpr unsigned kHeaderWordCount = 5; + +/// SPIR-V magic number +constexpr uint32_t kMagicNumber = 0x07230203; + +/// The serializer tool ID registered to the Khronos Group +constexpr uint32_t kGeneratorNumber = 22; + +/// Auto-generated getOpcode<*Op>() specializations +#define GET_SPIRV_SERIALIZATION_UTILS +#include "mlir/Dialect/SPIRV/SPIRVSerialization.inc" + +/// Appends a SPRI-V module header to `header` with the given `idBound`. +void appendModuleHeader(SmallVectorImpl &header, uint32_t idBound); + +/// Returns the word-count-prefixed opcode for an SPIR-V instruction. +uint32_t getPrefixedOpcode(uint32_t wordCount, spirv::Opcode opcode); + +/// Encodes an SPIR-V `literal` string into the given `binary` vector. 
+LogicalResult encodeStringLiteralInto(SmallVectorImpl &binary, + StringRef literal); +} // end namespace spirv +} // end namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SPIRV_BINARY_UTILS_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBitOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVBitOps.td new file mode 100644 index 0000000000000000000000000000000000000000..360edeec52d6ef57ab15e038174c7707fde69add --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBitOps.td @@ -0,0 +1,523 @@ +//===-- SPIRVBitOps.td - MLIR SPIR-V Bit Ops -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains bit ops for the SPIR-V dialect. It corresponds +// to "3.32.13. Bit Instructions" of the SPIR-V specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_BIT_OPS +#define SPIRV_BIT_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +class SPV_BitBinaryOp traits = []> : + // All the operands type used in bit instructions are SPV_Integer. 
+ SPV_BinaryOp; + +class SPV_BitFieldExtractOp traits = []> : + SPV_Op { + let arguments = (ins + SPV_ScalarOrVectorOf:$base, + SPV_Integer:$offset, + SPV_Integer:$count + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return ::parseBitFieldExtractOp(parser, result); }]; + let printer = [{ ::printBitFieldExtractOp(this->getOperation(), p); }]; + let verifier = [{ return ::verifyBitFieldExtractOp(this->getOperation()); }]; +} + +class SPV_BitUnaryOp traits = []> : + SPV_UnaryOp; + +class SPV_ShiftOp traits = []> : + SPV_BinaryOp { + let parser = [{ return ::parseShiftOp(parser, result); }]; + let printer = [{ ::printShiftOp(this->getOperation(), p); }]; + let verifier = [{ return ::verifyShiftOp(this->getOperation()); }]; +} + +// ----- + +def SPV_BitCountOp : SPV_BitUnaryOp<"BitCount", []> { + let summary = "Count the number of set bits in an object."; + + let description = [{ + Results are computed per component. + + Result Type must be a scalar or vector of integer type. The components + must be wide enough to hold the unsigned Width of Base as an unsigned + value. That is, no sign bit is needed or counted when checking for a + wide enough result width. + + Base must be a scalar or vector of integer type. It must have the same + number of components as Result Type. + + The result is the unsigned value that is the number of bits in Base that + are 1. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitcount-op ::= ssa-id `=` `spv.BitCount` ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.BitCount %0: i32 + %3 = spv.BitCount %1: vector<4xi32> + ``` + }]; +} + +// ----- + +def SPV_BitFieldInsertOp : SPV_Op<"BitFieldInsert", [NoSideEffect]> { + let summary = [{ + Make a copy of an object, with a modified bit field that comes from + another object. 
+ }]; + + let description = [{ + Results are computed per component. + + Result Type must be a scalar or vector of integer type. + + The type of Base and Insert must be the same as Result Type. + + Any result bits numbered outside [Offset, Offset + Count - 1] + (inclusive) will come from the corresponding bits in Base. + + Any result bits numbered in [Offset, Offset + Count - 1] come, in + order, from the bits numbered [0, Count - 1] of Insert. + + Count must be an integer type scalar. Count is the number of bits taken + from Insert. It will be consumed as an unsigned value. Count can be 0, + in which case the result will be Base. + + Offset must be an integer type scalar. Offset is the lowest-order bit + of the bit field. It will be consumed as an unsigned value. + + The resulting value is undefined if Count or Offset or their sum is + greater than the number of bits in the result. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitfield-insert-op ::= ssa-id `=` `spv.BitFieldInsert` ssa-use `,` ssa-use + `,` ssa-use `,` ssa-use + `:` integer-scalar-vector-type + `,` integer-type `,` integer-type + ``` + + For example: + + ``` + %0 = spv.BitFieldInsert %base, %insert, %offset, %count : vector<3xi32>, i8, i8 + ``` + }]; + + let arguments = (ins + SPV_ScalarOrVectorOf:$base, + SPV_ScalarOrVectorOf:$insert, + SPV_Integer:$offset, + SPV_Integer:$count + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); +} + +// ----- + +def SPV_BitFieldSExtractOp : SPV_BitFieldExtractOp<"BitFieldSExtract", []> { + let summary = "Extract a bit field from an object, with sign extension."; + + let description = [{ + Results are computed per component. + + Result Type must be a scalar or vector of integer type. + + The type of Base must be the same as Result Type. 
+ + If Count is greater than 0: The bits of Base numbered in [Offset, Offset + + Count - 1] (inclusive) become the bits numbered [0, Count - 1] of the + result. The remaining bits of the result will all be the same as bit + Offset + Count - 1 of Base. + + Count must be an integer type scalar. Count is the number of bits + extracted from Base. It will be consumed as an unsigned value. Count can + be 0, in which case the result will be 0. + + Offset must be an integer type scalar. Offset is the lowest-order bit + of the bit field to extract from Base. It will be consumed as an + unsigned value. + + The resulting value is undefined if Count or Offset or their sum is + greater than the number of bits in the result. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitfield-extract-s-op ::= ssa-id `=` `spv.BitFieldSExtract` ssa-use + `,` ssa-use `,` ssa-use + `:` integer-scalar-vector-type + `,` integer-type `,` integer-type + ``` + + For example: + + ``` + %0 = spv.BitFieldSExtract %base, %offset, %count : vector<3xi32>, i8, i8 + ``` + }]; +} + +// ----- + +def SPV_BitFieldUExtractOp : SPV_BitFieldExtractOp<"BitFieldUExtract", []> { + let summary = "Extract a bit field from an object, without sign extension."; + + let description = [{ + The semantics are the same as with OpBitFieldSExtract with the exception + that there is no sign extension. The remaining bits of the result will + all be 0. 
+ + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitfield-extract-u-op ::= ssa-id `=` `spv.BitFieldUExtract` ssa-use + `,` ssa-use `,` ssa-use + `:` integer-scalar-vector-type + `,` integer-type `,` integer-type + ``` + + For example: + + ``` + %0 = spv.BitFieldUExtract %base, %offset, %count : vector<3xi32>, i8, i8 + ``` + }]; +} + +// ----- + +def SPV_BitReverseOp : SPV_BitUnaryOp<"BitReverse", []> { + let summary = "Reverse the bits in an object."; + + let description = [{ + Results are computed per component. + + Result Type must be a scalar or vector of integer type. + + The type of Base must be the same as Result Type. + + The bit-number n of the result will be taken from bit-number Width - 1 - + n of Base, where Width is the OpTypeInt operand of the Result Type. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitreverse-op ::= ssa-id `=` `spv.BitReverse` ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.BitReverse %0 : i32 + %3 = spv.BitReverse %1 : vector<4xi32> + ``` + }]; +} + +// ----- + +def SPV_BitwiseAndOp : SPV_BitBinaryOp<"BitwiseAnd", [Commutative]> { + let summary = [{ + Result is 1 if both Operand 1 and Operand 2 are 1. Result is 0 if either + Operand 1 or Operand 2 are 0. + }]; + + let description = [{ + Results are computed per component, and within each component, per bit. + + Result Type must be a scalar or vector of integer type. The type of + Operand 1 and Operand 2 must be a scalar or vector of integer type. + They must have the same number of components as Result Type. They must + have the same component width as Result Type. 
+ + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitwise-and-op ::= ssa-id `=` `spv.BitwiseAnd` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.BitwiseAnd %0, %1 : i32 + %2 = spv.BitwiseAnd %0, %1 : vector<4xi32> + ``` + }]; +} + +// ----- + +def SPV_BitwiseOrOp : SPV_BitBinaryOp<"BitwiseOr", [Commutative]> { + let summary = [{ + Result is 1 if either Operand 1 or Operand 2 is 1. Result is 0 if both + Operand 1 and Operand 2 are 0. + }]; + + let description = [{ + Results are computed per component, and within each component, per bit. + + Result Type must be a scalar or vector of integer type. The type of + Operand 1 and Operand 2 must be a scalar or vector of integer type. + They must have the same number of components as Result Type. They must + have the same component width as Result Type. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitwise-or-op ::= ssa-id `=` `spv.BitwiseOr` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.BitwiseOr %0, %1 : i32 + %2 = spv.BitwiseOr %0, %1 : vector<4xi32> + ``` + }]; +} + +// ----- + +def SPV_BitwiseXorOp : SPV_BitBinaryOp<"BitwiseXor", [Commutative]> { + let summary = [{ + Result is 1 if exactly one of Operand 1 or Operand 2 is 1. Result is 0 + if Operand 1 and Operand 2 have the same value. + }]; + + let description = [{ + Results are computed per component, and within each component, per bit. + + Result Type must be a scalar or vector of integer type. The type of + Operand 1 and Operand 2 must be a scalar or vector of integer type. + They must have the same number of components as Result Type. They must + have the same component width as Result Type. 
+ + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + bitwise-xor-op ::= ssa-id `=` `spv.BitwiseXor` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.BitwiseXor %0, %1 : i32 + %2 = spv.BitwiseXor %0, %1 : vector<4xi32> + ``` + }]; +} + +// ----- + +def SPV_ShiftLeftLogicalOp : SPV_ShiftOp<"ShiftLeftLogical", []> { + let summary = [{ + Shift the bits in Base left by the number of bits specified in Shift. + The least-significant bits will be zero filled. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of each Base and Shift must be a scalar or vector of integer + type. Base and Shift must have the same number of components. The + number of components and bit width of the type of Base must be the same + as in Result Type. + + Shift is treated as unsigned. The result is undefined if Shift is + greater than or equal to the bit width of the components of Base. + + The number of components and bit width of Result Type must match those + Base type. All types must be integer types. + + Results are computed per component. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + shift-left-logical-op ::= ssa-id `=` `spv.ShiftLeftLogical` + ssa-use `,` ssa-use `:` + integer-scalar-vector-type `,` + integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.ShiftLeftLogical %0, %1 : i32, i16 + %5 = spv.ShiftLeftLogical %3, %4 : vector<3xi32>, vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_ShiftRightArithmeticOp : SPV_ShiftOp<"ShiftRightArithmetic", []> { + let summary = [{ + Shift the bits in Base right by the number of bits specified in Shift. + The most-significant bits will be filled with the sign bit from Base. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. 
+ + The type of each Base and Shift must be a scalar or vector of integer + type. Base and Shift must have the same number of components. The + number of components and bit width of the type of Base must be the same + as in Result Type. + + Shift is treated as unsigned. The result is undefined if Shift is + greater than or equal to the bit width of the components of Base. + + Results are computed per component. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + shift-right-arithmetic-op ::= ssa-id `=` `spv.ShiftRightArithmetic` + ssa-use `,` ssa-use `:` + integer-scalar-vector-type `,` + integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.ShiftRightArithmetic %0, %1 : i32, i16 + %5 = spv.ShiftRightArithmetic %3, %4 : vector<3xi32>, vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_ShiftRightLogicalOp : SPV_ShiftOp<"ShiftRightLogical", []> { + let summary = [{ + Shift the bits in Base right by the number of bits specified in Shift. + The most-significant bits will be zero filled. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + The type of each Base and Shift must be a scalar or vector of integer + type. Base and Shift must have the same number of components. The + number of components and bit width of the type of Base must be the same + as in Result Type. + + Shift is consumed as an unsigned integer. The result is undefined if + Shift is greater than or equal to the bit width of the components of + Base. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + shift-right-logical-op ::= ssa-id `=` `spv.ShiftRightLogical` + ssa-use `,` ssa-use `:` + integer-scalar-vector-type `,` + integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.ShiftRightLogical %0, %1 : i32, i16 + %5 = spv.ShiftRightLogical %3, %4 : vector<3xi32>, vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_NotOp : SPV_BitUnaryOp<"Not", []> { + let summary = "Complement the bits of Operand."; + + let description = [{ + Results are computed per component, and within each component, per bit. + + Result Type must be a scalar or vector of integer type. + + Operand’s type must be a scalar or vector of integer type. It must + have the same number of components as Result Type. The component width + must equal the component width in Result Type. + + ### Custom assembly form + + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + not-op ::= ssa-id `=` `spv.BitNot` ssa-use `:` integer-scalar-vector-type + ``` + + For example: + + ``` + %2 = spv.Not %0 : i32 + %3 = spv.Not %1 : vector<4xi32> + ``` + }]; +} + +#endif // SPIRV_BIT_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td new file mode 100644 index 0000000000000000000000000000000000000000..99fe0bbbf5f34fb4fa285a8a6c62658947e6a939 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVCastOps.td @@ -0,0 +1,325 @@ +//===-- SPIRVCastOps.td - MLIR SPIR-V Cast Ops -------*- tablegen -*-------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains cast ops for the SPIR-V dialect. 
It corresponds +// to "3.32.11. Conversion Instructions" of the SPIR-V specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_CAST_OPS +#define SPIRV_CAST_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +class SPV_CastOp traits = []> : + SPV_Op { + let arguments = (ins + SPV_ScalarOrVectorOf:$operand + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return mlir::impl::parseCastOp(parser, result); }]; + let printer = [{ mlir::impl::printCastOp(this->getOperation(), p); }]; + let verifier = [{ return verifyCastOp(this->getOperation()); }]; +} + +// ----- + +def SPV_BitcastOp : SPV_Op<"Bitcast", [NoSideEffect]> { + let summary = "Bit pattern-preserving type conversion."; + + let description = [{ + Result Type must be an OpTypePointer, or a scalar or vector of + numerical-type. + + Operand must have a type of OpTypePointer, or a scalar or vector of + numerical-type. It must be a different type than Result Type. + + If either Result Type or Operand is a pointer, the other must be a + pointer (diverges from the SPIR-V spec). + + If Result Type has a different number of components than Operand, the + total number of bits in Result Type must equal the total number of bits + in Operand. Let L be the type, either Result Type or Operand’s type, + that has the larger number of components. Let S be the other type, with + the smaller number of components. The number of components in L must be + an integer multiple of the number of components in S. The first + component (that is, the only or lowest-numbered component) of S maps to + the first components of L, and so on, up to the last component of S + mapping to the last components of L. Within this mapping, any single + component of S (mapping to multiple components of L) maps its lower- + ordered bits to the lower-numbered components of L. 
+ + ### Custom assembly form + + ``` + bitcast-op ::= ssa-id `=` `spv.Bitcast` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.Bitcast %0 : f32 to i32 + %1 = spv.Bitcast %0 : vector<2xf32> to i64 + %1 = spv.Bitcast %0 : !spv.ptr to !spv.ptr + ``` + }]; + + let arguments = (ins + SPV_ScalarOrVectorOrPtr:$operand + ); + + let results = (outs + SPV_ScalarOrVectorOrPtr:$result + ); + + let parser = [{ return mlir::impl::parseCastOp(parser, result); }]; + let printer = [{ mlir::impl::printCastOp(this->getOperation(), p); }]; + + let hasCanonicalizer = 1; +} + +// ----- + +def SPV_ConvertFToSOp : SPV_CastOp<"ConvertFToS", SPV_Integer, SPV_Float, []> { + let summary = [{ + Convert value numerically from floating point to signed integer, with + round toward 0.0. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + Float Value must be a scalar or vector of floating-point type. It must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + convert-f-to-s-op ::= ssa-id `=` `spv.ConvertFToSOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.ConvertFToS %0 : f32 to i32 + %3 = spv.ConvertFToS %2 : vector<3xf32> to vector<3xi32> + ``` + }]; +} + +// ----- + +def SPV_ConvertFToUOp : SPV_CastOp<"ConvertFToU", SPV_Integer, SPV_Float, []> { + let summary = [{ + Convert value numerically from floating point to unsigned integer, with + round toward 0.0. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type, whose Signedness + operand is 0. + + Float Value must be a scalar or vector of floating-point type. It must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + convert-f-to-u-op ::= ssa-id `=` `spv.ConvertFToUOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.ConvertFToU %0 : f32 to i32 + %3 = spv.ConvertFToU %2 : vector<3xf32> to vector<3xi32> + ``` + }]; +} + +// ----- + +def SPV_ConvertSToFOp : SPV_CastOp<"ConvertSToF", SPV_Float, SPV_Integer, []> { + let summary = [{ + Convert value numerically from signed integer to floating point. + }]; + + let description = [{ + Result Type must be a scalar or vector of floating-point type. + + Signed Value must be a scalar or vector of integer type. It must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + convert-s-to-f-op ::= ssa-id `=` `spv.ConvertSToFOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.ConvertSToF %0 : i32 to f32 + %3 = spv.ConvertSToF %2 : vector<3xi32> to vector<3xf32> + ``` + }]; +} + +// ----- + +def SPV_ConvertUToFOp : SPV_CastOp<"ConvertUToF", SPV_Float, SPV_Integer, []> { + let summary = [{ + Convert value numerically from unsigned integer to floating point. + }]; + + let description = [{ + Result Type must be a scalar or vector of floating-point type. + + Unsigned Value must be a scalar or vector of integer type. It must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + convert-u-to-f-op ::= ssa-id `=` `spv.ConvertUToFOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.ConvertUToF %0 : i32 to f32 + %3 = spv.ConvertUToF %2 : vector<3xi32> to vector<3xf32> + ``` + }]; +} + +// ----- + +def SPV_FConvertOp : SPV_CastOp<"FConvert", SPV_Float, SPV_Float, []> { + let summary = [{ + Convert value numerically from one floating-point width to another + width. 
+ }]; + + let description = [{ + Result Type must be a scalar or vector of floating-point type. + + Float Value must be a scalar or vector of floating-point type. It must + have the same number of components as Result Type. The component width + cannot equal the component width in Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + f-convert-op ::= ssa-id `=` `spv.FConvertOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.FConvertOp %0 : f32 to f64 + %3 = spv.FConvertOp %2 : vector<3xf32> to vector<3xf64> + ``` + }]; + + let verifier = [{ return verifyCastOp(this->getOperation(), false); }]; +} + +// ----- + +def SPV_SConvertOp : SPV_CastOp<"SConvert", SPV_Integer, SPV_Integer, []> { + let summary = [{ + Convert signed width. This is either a truncate or a sign extend. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type. + + Signed Value must be a scalar or vector of integer type. It must have + the same number of components as Result Type. The component width + cannot equal the component width in Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + s-convert-op ::= ssa-id `=` `spv.SConvertOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.SConvertOp %0 : i32 to i64 + %3 = spv.SConvertOp %2 : vector<3xi32> to vector<3xi64> + ``` + }]; + + let verifier = [{ return verifyCastOp(this->getOperation(), false); }]; +} + +// ----- + +def SPV_UConvertOp : SPV_CastOp<"UConvert", SPV_Integer, SPV_Integer, []> { + let summary = [{ + Convert unsigned width. This is either a truncate or a zero extend. + }]; + + let description = [{ + Result Type must be a scalar or vector of integer type, whose Signedness + operand is 0. + + Unsigned Value must be a scalar or vector of integer type. It must have + the same number of components as Result Type. 
The component width + cannot equal the component width in Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + u-convert-op ::= ssa-id `=` `spv.UConvertOp` ssa-use + `:` operand-type `to` result-type + ``` + + For example: + + ``` + %1 = spv.UConvertOp %0 : i32 to i64 + %3 = spv.UConvertOp %2 : vector<3xi32> to vector<3xi64> + ``` + }]; + + let verifier = [{ return verifyCastOp(this->getOperation(), false); }]; +} + +#endif // SPIRV_CAST_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td new file mode 100644 index 0000000000000000000000000000000000000000..5a8235fff1a3e8178caf92d891a6c75d2664efb4 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td @@ -0,0 +1,166 @@ +//===-- SPIRVCompositeOps.td - MLIR SPIR-V Composite Ops ---*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains composite ops for SPIR-V dialect. It corresponds +// to "3.32.12. Composite Instructions" of the SPIR-V spec. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_COMPOSITE_OPS +#define SPIRV_COMPOSITE_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +// ----- + +def SPV_CompositeConstructOp : SPV_Op<"CompositeConstruct", [NoSideEffect]> { + let summary = [{ + Construct a new composite object from a set of constituent objects that + will fully form it. + }]; + + let description = [{ + Result Type must be a composite type, whose top-level + members/elements/components/columns have the same type as the types of + the operands, with one exception. 
The exception is that for constructing + a vector, the operands may also be vectors with the same component type + as the Result Type component type. When constructing a vector, the total + number of components in all the operands must equal the number of + components in Result Type. + + Constituents will become members of a structure, or elements of an + array, or components of a vector, or columns of a matrix. There must be + exactly one Constituent for each top-level + member/element/component/column of the result, with one exception. The + exception is that for constructing a vector, a contiguous subset of the + scalars consumed can be represented by a vector operand instead. The + Constituents must appear in the order needed by the definition of the + type of the result. When constructing a vector, there must be at least + two Constituent operands. + + ### Custom assembly form + + ``` + composite-construct-op ::= ssa-id `=` `spv.CompositeConstruct` + (ssa-use (`,` ssa-use)* )? `:` composite-type + ``` + + For example: + + ``` + %0 = spv.CompositeConstruct %1, %2, %3 : vector<3xf32> + ``` + }]; + + let arguments = (ins + Variadic:$constituents + ); + + let results = (outs + SPV_Composite:$result + ); +} + +// ----- + +def SPV_CompositeExtractOp : SPV_Op<"CompositeExtract", [NoSideEffect]> { + let summary = "Extract a part of a composite object."; + + let description = [{ + Result Type must be the type of object selected by the last provided + index. The instruction result is the extracted object. + + Composite is the composite to extract from. + + Indexes walk the type hierarchy, potentially down to component + granularity, to select the part to extract. All indexes must be in + bounds. All composite constituents use zero-based numbering, as + described by their OpType… instruction. 
+ + ### Custom assembly form + + ``` + composite-extract-op ::= ssa-id `=` `spv.CompositeExtract` ssa-use + `[` integer-literal (',' integer-literal)* `]` + `:` composite-type + ``` + + For example: + + ``` + %0 = spv.Variable : !spv.ptr>, Function> + %1 = spv.Load "Function" %0 ["Volatile"] : !spv.array<4x!spv.array<4xf32>> + %2 = spv.CompositeExtract %1[1 : i32] : !spv.array<4x!spv.array<4xf32>> + ``` + + }]; + + let arguments = (ins + SPV_Composite:$composite, + I32ArrayAttr:$indices + ); + + let results = (outs + SPV_Type:$component + ); + + let builders = [ + OpBuilder<[{Builder *builder, OperationState &state, + Value composite, ArrayRef indices}]> + ]; + + let hasFolder = 1; +} + +// ----- + +def SPV_CompositeInsertOp : SPV_Op<"CompositeInsert", [NoSideEffect]> { + let summary = [{ + Make a copy of a composite object, while modifying one part of it. + }]; + + let description = [{ + Result Type must be the same type as Composite. + + Object is the object to use as the modified part. + + Composite is the composite to copy all but the modified part from. + + Indexes walk the type hierarchy of Composite to the desired depth, + potentially down to component granularity, to select the part to modify. + All indexes must be in bounds. All composite constituents use zero-based + numbering, as described by their OpType… instruction. The type of the + part selected to modify must match the type of Object. 
+ + ### Custom assembly form + + ``` + composite-insert-op ::= ssa-id `=` `spv.CompositeInsert` ssa-use, ssa-use + `[` integer-literal (',' integer-literal)* `]` + `:` object-type `into` composite-type + ``` + + For example: + + ``` + %0 = spv.CompositeInsert %object, %composite[1 : i32] : f32 into !spv.array<4xf32> + ``` + }]; + + let arguments = (ins + SPV_Type:$object, + SPV_Composite:$composite, + I32ArrayAttr:$indices + ); + + let results = (outs + SPV_Composite:$result + ); +} + +#endif // SPIRV_COMPOSITE_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td new file mode 100644 index 0000000000000000000000000000000000000000..be0955794515ed58a8e245d7ab236f0694be26da --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td @@ -0,0 +1,466 @@ +//===-- SPIRVControlFlowOps.td - SPIR-V Control Flow Ops ---*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains control flow ops for the SPIR-V dialect. It corresponds +// to "3.32.17. Control-Flow Instructions" of the SPIR-V specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_CONTROLFLOW_OPS +#define SPIRV_CONTROLFLOW_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" +include "mlir/Analysis/CallInterfaces.td" + +// ----- + +def SPV_BranchOp : SPV_Op<"Branch", [InFunctionScope, Terminator]> { + let summary = "Unconditional branch to target block."; + + let description = [{ + This instruction must be the last instruction in a block. + + ### Custom assembly form + + ``` + branch-op ::= `spv.Branch` successor + successor ::= bb-id branch-use-list? 
+ branch-use-list ::= `(` ssa-use-list `:` type-list-no-parens `)` + ``` + + For example: + + ``` + spv.Branch ^target + spv.Branch ^target(%0, %1: i32, f32) + ``` + }]; + + let arguments = (ins + Variadic:$block_arguments + ); + + let results = (outs); + + let builders = [ + OpBuilder< + "Builder *, OperationState &state, " + "Block *successor, ValueRange arguments = {}", [{ + state.addSuccessor(successor, arguments); + }] + > + ]; + + let skipDefaultBuilders = 1; + + let extraClassDeclaration = [{ + /// Returns the branch target block. + Block *getTarget() { return getOperation()->getSuccessor(0); } + + /// Returns the block arguments. + operand_range getBlockArguments() { + return getOperation()->getSuccessorOperands(0); + } + }]; + + let autogenSerialization = 0; +} + +// ----- + +def SPV_BranchConditionalOp : SPV_Op<"BranchConditional", + [InFunctionScope, Terminator]> { + let summary = [{ + If Condition is true, branch to true block, otherwise branch to false + block. + }]; + + let description = [{ + Condition must be a Boolean type scalar. + + Branch weights are unsigned 32-bit integer literals. There must be + either no Branch Weights or exactly two branch weights. If present, the + first is the weight for branching to True Label, and the second is the + weight for branching to False Label. The implied probability that a + branch is taken is its weight divided by the sum of the two Branch + weights. At least one weight must be non-zero. A weight of zero does not + imply a branch is dead or permit its removal; branch weights are only + hints. The two weights must not overflow a 32-bit unsigned integer when + added together. + + This instruction must be the last instruction in a block. + + ### Custom assembly form + + ``` + branch-conditional-op ::= `spv.BranchConditional` ssa-use + (`[` integer-literal, integer-literal `]`)? + `,` successor `,` successor + successor ::= bb-id branch-use-list? 
+ branch-use-list ::= `(` ssa-use-list `:` type-list-no-parens `)` + ``` + + For example: + + ``` + spv.BranchConditional %condition, ^true_branch, ^false_branch + spv.BranchConditional %condition, ^true_branch(%0: i32), ^false_branch(%1: i32) + ``` + }]; + + let arguments = (ins + SPV_Bool:$condition, + Variadic:$branch_arguments, + OptionalAttr:$branch_weights + ); + + let results = (outs); + + let builders = [ + OpBuilder< + "Builder *builder, OperationState &state, Value condition, " + "Block *trueBlock, ValueRange trueArguments, " + "Block *falseBlock, ValueRange falseArguments, " + "Optional> weights = {}", + [{ + state.addOperands(condition); + state.addSuccessor(trueBlock, trueArguments); + state.addSuccessor(falseBlock, falseArguments); + if (weights) { + auto attr = + builder->getI32ArrayAttr({static_cast(weights->first), + static_cast(weights->second)}); + state.addAttribute("branch_weights", attr); + } + }] + > + ]; + + let skipDefaultBuilders = 1; + + let autogenSerialization = 0; + + let extraClassDeclaration = [{ + /// Branch indices into the successor list. + enum { kTrueIndex = 0, kFalseIndex = 1 }; + + /// Returns the target block for the true branch. + Block *getTrueBlock() { return getOperation()->getSuccessor(kTrueIndex); } + + /// Returns the target block for the false branch. + Block *getFalseBlock() { return getOperation()->getSuccessor(kFalseIndex); } + + /// Returns the number of arguments to the true target block. + unsigned getNumTrueBlockArguments() { + return getNumSuccessorOperands(kTrueIndex); + } + + /// Returns the number of arguments to the false target block. + unsigned getNumFalseBlockArguments() { + return getNumSuccessorOperands(kFalseIndex); + } + + // Iterator and range support for true target block arguments. 
+ operand_iterator true_block_argument_begin() { + return operand_begin() + getTrueBlockArgumentIndex(); + } + operand_iterator true_block_argument_end() { + return true_block_argument_begin() + getNumTrueBlockArguments(); + } + operand_range getTrueBlockArguments() { + return {true_block_argument_begin(), true_block_argument_end()}; + } + + // Iterator and range support for false target block arguments. + operand_iterator false_block_argument_begin() { + return true_block_argument_end(); + } + operand_iterator false_block_argument_end() { + return false_block_argument_begin() + getNumFalseBlockArguments(); + } + operand_range getFalseBlockArguments() { + return {false_block_argument_begin(), false_block_argument_end()}; + } + + private: + /// Gets the index of the first true block argument in the operand list. + unsigned getTrueBlockArgumentIndex() { + return 1; // Omit the first argument, which is the condition. + } + + /// Gets the index of the first false block argument in the operand list. + unsigned getFalseBlockArgumentIndex() { + return getTrueBlockArgumentIndex() + getNumTrueBlockArguments(); + } + }]; +} + +// ----- + +def SPV_FunctionCallOp : SPV_Op<"FunctionCall", [ + InFunctionScope, DeclareOpInterfaceMethods]> { + let summary = "Call a function."; + + let description = [{ + Result Type is the type of the return value of the function. It must be + the same as the Return Type operand of the Function Type operand of the + Function operand. + + Function is an OpFunction instruction. This could be a forward + reference. + + Argument N is the object to copy to parameter N of Function. + + Note: A forward call is possible because there is no missing type + information: Result Type must match the Return Type of the function, and + the calling argument types must match the formal parameter types. 
+ + ### Custom assembly form + + ``` + function-call-op ::= `spv.FunctionCall` function-id `(` ssa-use-list `)` + `:` function-type + ``` + + For example: + + ``` + spv.FunctionCall @f_void(%arg0) : (i32) -> () + %0 = spv.FunctionCall @f_iadd(%arg0, %arg1) : (i32, i32) -> i32 + ``` + }]; + + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$arguments + ); + + let results = (outs + SPV_Optional:$result + ); + + let autogenSerialization = 0; +} + +// ----- + +def SPV_LoopOp : SPV_Op<"loop", [InFunctionScope]> { + let summary = "Define a structured loop."; + + let description = [{ + SPIR-V can explicitly declare structured control-flow constructs using merge + instructions. These explicitly declare a header block before the control + flow diverges and a merge block where control flow subsequently converges. + These blocks delimit constructs that must nest, and can only be entered + and exited in structured ways. See "2.11. Structured Control Flow" of the + SPIR-V spec for more details. + + Instead of having a `spv.LoopMerge` op to directly model loop merge + instruction for indicating the merge and continue target, we use regions + to delimit the boundary of the loop: the merge target is the next op + following the `spv.loop` op and the continue target is the block that + has a back-edge pointing to the entry block inside the `spv.loop`'s region. + This way it's easier to discover all blocks belonging to a construct and + it plays nicer with the MLIR system. + + The `spv.loop` region should contain at least four blocks: one entry block, + one loop header block, one loop continue block, one loop merge block. + The entry block should be the first block and it should jump to the loop + header block, which is the second block. The loop merge block should be the + last block. The merge block should only contain a `spv._merge` op. + The continue block should be the second to last block and it should have a + branch to the loop header block. 
The loop continue block should be the only + block, except the entry block, branching to the header block. + }]; + + let arguments = (ins + SPV_LoopControlAttr:$loop_control + ); + + let results = (outs); + + let regions = (region AnyRegion:$body); + + let builders = [OpBuilder<"Builder *builder, OperationState &state">]; + + let extraClassDeclaration = [{ + // Returns the entry block. + Block *getEntryBlock(); + + // Returns the loop header block. + Block *getHeaderBlock(); + + // Returns the loop continue block. + Block *getContinueBlock(); + + // Returns the loop merge block. + Block *getMergeBlock(); + + // Adds an empty entry block and loop merge block containing one + // spv._merge op. + void addEntryAndMergeBlock(); + }]; + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +// ----- + +def SPV_MergeOp : SPV_Op<"_merge", [Terminator]> { + let summary = "A special terminator for merging a structured selection/loop."; + + let description = [{ + We use `spv.selection`/`spv.loop` for modelling structured selection/loop. + This op is a terminator used inside their regions to mean jumping to the + merge point, which is the next op following the `spv.selection` or + `spv.loop` op. This op does not have a corresponding instruction in the + SPIR-V binary format; it's solely for structural purpose. + }]; + + let arguments = (ins); + + let results = (outs); + + let parser = [{ return parseNoIOOp(parser, result); }]; + let printer = [{ printNoIOOp(getOperation(), p); }]; + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +// ----- + +def SPV_ReturnOp : SPV_Op<"Return", [InFunctionScope, Terminator]> { + let summary = "Return with no value from a function with void return type."; + + let description = [{ + This instruction must be the last instruction in a block. 
+ + ### Custom assembly form + + ``` + return-op ::= `spv.Return` + ``` + }]; + + let arguments = (ins); + + let results = (outs); + + let parser = [{ return parseNoIOOp(parser, result); }]; + let printer = [{ printNoIOOp(getOperation(), p); }]; +} + +// ----- + +def SPV_UnreachableOp : SPV_Op<"Unreachable", [InFunctionScope, Terminator]> { + let summary = "Declares that this block is not reachable in the CFG."; + + let description = [{ + This instruction must be the last instruction in a block. + + ### Custom assembly form + + ``` + unreachable-op ::= `spv.Unreachable` + ``` + }]; + + let arguments = (ins); + + let results = (outs); + + let parser = [{ return parseNoIOOp(parser, result); }]; + let printer = [{ printNoIOOp(getOperation(), p); }]; +} + +// ----- + +def SPV_ReturnValueOp : SPV_Op<"ReturnValue", [InFunctionScope, Terminator]> { + let summary = "Return a value from a function."; + + let description = [{ + Value is the value returned, by copy, and must match the Return Type + operand of the OpTypeFunction type of the OpFunction body this return + instruction is in. + + This instruction must be the last instruction in a block. + + ### Custom assembly form + + ``` + return-value-op ::= `spv.ReturnValue` ssa-use `:` spirv-type + ``` + + For example: + + ``` + spv.ReturnValue %0 : f32 + ``` + }]; + + let arguments = (ins + SPV_Type:$value + ); + + let results = (outs); +} + +def SPV_SelectionOp : SPV_Op<"selection", [InFunctionScope]> { + let summary = "Define a structured selection."; + + let description = [{ + SPIR-V can explicitly declare structured control-flow constructs using merge + instructions. These explicitly declare a header block before the control + flow diverges and a merge block where control flow subsequently converges. + These blocks delimit constructs that must nest, and can only be entered + and exited in structured ways. See "2.11. Structured Control Flow" of the + SPIR-V spec for more details. 
+ + Instead of having a `spv.SelectionMerge` op to directly model selection + merge instruction for indicating the merge target, we use regions to delimit + the boundary of the selection: the merge target is the next op following the + `spv.selection` op. This way it's easier to discover all blocks belonging to + the selection and it plays nicer with the MLIR system. + + The `spv.selection` region should contain at least two blocks: one selection + header block, and one selection merge. The selection header block should be + the first block. The selection merge block should be the last block. + The merge block should only contain a `spv._merge` op. + }]; + + let arguments = (ins + SPV_SelectionControlAttr:$selection_control + ); + + let results = (outs); + + let regions = (region AnyRegion:$body); + + let extraClassDeclaration = [{ + // Returns the selection header block. + Block *getHeaderBlock(); + + // Returns the selection merge block. + Block *getMergeBlock(); + + // Adds a selection merge block containing one spv._merge op. + void addMergeBlock(); + }]; + + let hasOpcode = 0; + + let autogenSerialization = 0; + + let hasCanonicalizer = 1; +} + +#endif // SPIRV_CONTROLFLOW_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h new file mode 100644 index 0000000000000000000000000000000000000000..0c0eebd34d1640899a8eb9bab1f2da22ea447408 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVDialect.h @@ -0,0 +1,53 @@ +//===- SPIRVDialect.h - MLIR SPIR-V dialect ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the SPIR-V dialect in MLIR. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SPIRVDIALECT_H_ +#define MLIR_DIALECT_SPIRV_SPIRVDIALECT_H_ + +#include "mlir/IR/Dialect.h" + +namespace mlir { +namespace spirv { + +enum class Decoration : uint32_t; + +class SPIRVDialect : public Dialect { +public: + explicit SPIRVDialect(MLIRContext *context); + + static StringRef getDialectNamespace() { return "spv"; } + + /// Checks if the given `type` is valid in SPIR-V dialect. + static bool isValidType(Type type); + + /// Checks if the given `scalar type` is valid in SPIR-V dialect. + static bool isValidScalarType(Type type); + + /// Returns the attribute name to use when specifying decorations on results + /// of operations. + static std::string getAttributeName(Decoration decoration); + + /// Parses a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + /// Prints a type registered to this dialect. + void printType(Type type, DialectAsmPrinter &os) const override; + + /// Provides a hook for materializing a constant to this dialect. + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; +}; + +} // end namespace spirv +} // end namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SPIRVDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td new file mode 100644 index 0000000000000000000000000000000000000000..b2eacbf306aea2309053b202773161bc1e33e750 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td @@ -0,0 +1,570 @@ +//===- SPIRVGLSLOps.td - GLSL extended insts spec file -----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the op definition spec of GLSL extension ops. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_GLSL_OPS +#define SPIRV_GLSL_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +//===----------------------------------------------------------------------===// +// SPIR-V GLSL 4.50 opcode specification. +//===----------------------------------------------------------------------===// + +// Base class for all GLSL ops. +class SPV_GLSLOp traits = []> : + SPV_ExtInstOp; + +// Base class for GLSL unary ops. +class SPV_GLSLUnaryOp traits = []> : + SPV_GLSLOp { + + let arguments = (ins + SPV_ScalarOrVectorOf:$operand + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return parseUnaryOp(parser, result); }]; + + let printer = [{ return printUnaryOp(getOperation(), p); }]; + + let verifier = [{ return success(); }]; +} + +// Base class for GLSL Unary arithmetic ops where return type matches +// the operand type. +class SPV_GLSLUnaryArithmeticOp traits = []> : + SPV_GLSLUnaryOp; + +// Base class for GLSL binary ops. +class SPV_GLSLBinaryOp traits = []> : + SPV_GLSLOp { + + let arguments = (ins + SPV_ScalarOrVectorOf:$lhs, + SPV_ScalarOrVectorOf:$rhs + ); + + let results = (outs + SPV_ScalarOrVectorOf:$result + ); + + let parser = [{ return impl::parseOneResultSameOperandTypeOp(parser, result); }]; + + let printer = [{ return impl::printOneResultOp(getOperation(), p); }]; + + let verifier = [{ return success(); }]; +} + +// Base class for GLSL Binary arithmetic ops where operand types and +// return type matches. 
+class SPV_GLSLBinaryArithmeticOp traits = []> : + SPV_GLSLBinaryOp; + +// ----- + +def SPV_GLSLFAbsOp : SPV_GLSLUnaryArithmeticOp<"FAbs", 4, SPV_Float> { + let summary = "Absolute value of operand"; + + let description = [{ + Result is x if x >= 0; otherwise result is -x. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + abs-op ::= ssa-id `=` `spv.GLSL.FAbs` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.FAbs %0 : f32 + %3 = spv.GLSL.FAbs %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLSAbsOp : SPV_GLSLUnaryArithmeticOp<"SAbs", 5, SPV_Integer> { + let summary = "Absolute value of operand"; + + let description = [{ + Result is x if x ≥ 0; otherwise result is -x, where x is interpreted as a + signed integer. + + Result Type and the type of x must both be integer scalar or integer vector + types. Result Type and operand types must have the same number of components + with the same component width. Results are computed per component. + + ### Custom assembly format + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + abs-op ::= ssa-id `=` `spv.GLSL.SAbs` ssa-use `:` + integer-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.SAbs %0 : i32 + %3 = spv.GLSL.SAbs %1 : vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_GLSLCeilOp : SPV_GLSLUnaryArithmeticOp<"Ceil", 9, SPV_Float> { + let summary = "Rounds up to the next whole number"; + + let description = [{ + Result is the value equal to the nearest whole number that is greater than + or equal to x. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. 
Results are computed + per component. + + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + ceil-op ::= ssa-id `=` `spv.GLSL.Ceil` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Ceil %0 : f32 + %3 = spv.GLSL.Ceil %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLCosOp : SPV_GLSLUnaryArithmeticOp<"Cos", 14, SPV_Float16or32> { + let summary = "Cosine of operand in radians"; + + let description = [{ + The standard trigonometric cosine of x radians. + + The operand x must be a scalar or vector whose component type is 16-bit or + 32-bit floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` + restricted-float-scalar-type ::= `f16` | `f32` + restricted-float-scalar-vector-type ::= + restricted-float-scalar-type | + `vector<` integer-literal `x` restricted-float-scalar-type `>` + cos-op ::= ssa-id `=` `spv.GLSL.Cos` ssa-use `:` + restricted-float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Cos %0 : f32 + %3 = spv.GLSL.Cos %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLExpOp : SPV_GLSLUnaryArithmeticOp<"Exp", 27, SPV_Float16or32> { + let summary = "Exponentiation of Operand 1"; + + let description = [{ + Result is the natural exponentiation of x; e^x. + + The operand x must be a scalar or vector whose component type is + 16-bit or 32-bit floating-point. + + Result Type and the type of x must be the same type. 
Results are + computed per component."; + + ### Custom assembly format + ``` + restricted-float-scalar-type ::= `f16` | `f32` + restricted-float-scalar-vector-type ::= + restricted-float-scalar-type | + `vector<` integer-literal `x` restricted-float-scalar-type `>` + exp-op ::= ssa-id `=` `spv.GLSL.Exp` ssa-use `:` + restricted-float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Exp %0 : f32 + %3 = spv.GLSL.Exp %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLFloorOp : SPV_GLSLUnaryArithmeticOp<"Floor", 8, SPV_Float> { + let summary = "Rounds down to the next whole number"; + + let description = [{ + Result is the value equal to the nearest whole number that is less than or + equal to x. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + floor-op ::= ssa-id `=` `spv.GLSL.Floor` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Floor %0 : f32 + %3 = spv.GLSL.Floor %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLInverseSqrtOp : SPV_GLSLUnaryArithmeticOp<"InverseSqrt", 32, SPV_Float> { + let summary = "Reciprocal of sqrt(operand)"; + + let description = [{ + Result is the reciprocal of sqrt x. Result is undefined if x <= 0. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. 
+ + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + rsqrt-op ::= ssa-id `=` `spv.GLSL.InverseSqrt` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.InverseSqrt %0 : f32 + %3 = spv.GLSL.InverseSqrt %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLLogOp : SPV_GLSLUnaryArithmeticOp<"Log", 28, SPV_Float16or32> { + let summary = "Natural logarithm of the operand"; + + let description = [{ + Result is the natural logarithm of x, i.e., the value y which satisfies the + equation x = ey. Result is undefined if x <= 0. + + The operand x must be a scalar or vector whose component type is 16-bit or + 32-bit floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` + restricted-float-scalar-type ::= `f16` | `f32` + restricted-float-scalar-vector-type ::= + restricted-float-scalar-type | + `vector<` integer-literal `x` restricted-float-scalar-type `>` + log-op ::= ssa-id `=` `spv.GLSL.Log` ssa-use `:` + restricted-float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Log %0 : f32 + %3 = spv.GLSL.Log %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLFMaxOp : SPV_GLSLBinaryArithmeticOp<"FMax", 40, SPV_Float> { + let summary = "Return maximum of two floating-point operands"; + + let description = [{ + Result is y if x < y; otherwise result is x. Which operand is the + result is undefined if one of the operands is a NaN. + + The operands must all be a scalar or vector whose component type + is floating-point. + + Result Type and the type of all operands must be the same + type. Results are computed per component. 
+ + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fmax-op ::= ssa-id `=` `spv.GLSL.FMax` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.FMax %0, %1 : f32 + %3 = spv.GLSL.FMax %0, %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLSMaxOp : SPV_GLSLBinaryArithmeticOp<"SMax", 42, SPV_Integer> { + let summary = "Return maximum of two signed integer operands"; + + let description = [{ + Result is y if x < y; otherwise result is x, where x and y are interpreted + as signed integers. + + Result Type and the type of x and y must both be integer scalar or integer + vector types. Result Type and operand types must have the same number of + components with the same component width. Results are computed per + component. + + ### Custom assembly format + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + smax-op ::= ssa-id `=` `spv.GLSL.SMax` ssa-use `:` + integer-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.SMax %0, %1 : i32 + %3 = spv.GLSL.SMax %0, %1 : vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_GLSLFMinOp : SPV_GLSLBinaryArithmeticOp<"FMin", 37, SPV_Float> { + let summary = "Return minimum of two floating-point operands"; + + let description = [{ + Result is y if y < x; otherwise result is x. Which operand is the result is + undefined if one of the operands is a NaN. + + The operands must all be a scalar or vector whose component type is + floating-point. + + Result Type and the type of all operands must be the same type. Results are + computed per component. 
+ + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fmin-op ::= ssa-id `=` `spv.GLSL.FMin` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.FMin %0, %1 : f32 + %3 = spv.GLSL.FMin %0, %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLSMinOp : SPV_GLSLBinaryArithmeticOp<"SMin", 39, SPV_Integer> { + let summary = "Return minimum of two signed integer operands"; + + let description = [{ + Result is y if y < x; otherwise result is x, where x and y are interpreted + as signed integers. + + Result Type and the type of x and y must both be integer scalar or integer + vector types. Result Type and operand types must have the same number of + components with the same component width. Results are computed per + component. + + ### Custom assembly format + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + smin-op ::= ssa-id `=` `spv.GLSL.SMin` ssa-use `:` + integer-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.SMin %0, %1 : i32 + %3 = spv.GLSL.SMin %0, %1 : vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_GLSLFSignOp : SPV_GLSLUnaryArithmeticOp<"FSign", 6, SPV_Float> { + let summary = "Returns the sign of the operand"; + + let description = [{ + Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. 
+ + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + sign-op ::= ssa-id `=` `spv.GLSL.FSign` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.FSign %0 : f32 + %3 = spv.GLSL.FSign %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLSSignOp : SPV_GLSLUnaryArithmeticOp<"SSign", 7, SPV_Integer> { + let summary = "Returns the sign of the operand"; + + let description = [{ + Result is 1 if x > 0, 0 if x = 0, or -1 if x < 0, where x is interpreted as + a signed integer. + + Result Type and the type of x must both be integer scalar or integer vector + types. Result Type and operand types must have the same number of components + with the same component width. Results are computed per component. + + ### Custom assembly format + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sign-op ::= ssa-id `=` `spv.GLSL.SSign` ssa-use `:` + integer-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.SSign %0 : i32 + %3 = spv.GLSL.SSign %1 : vector<3xi16> + ``` + }]; +} + +// ----- + +def SPV_GLSLSqrtOp : SPV_GLSLUnaryArithmeticOp<"Sqrt", 31, SPV_Float> { + let summary = "Returns the square root of the operand"; + + let description = [{ + Result is the square root of x. Result is undefined if x < 0. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. 
+ + ### Custom assembly format + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + sqrt-op ::= ssa-id `=` `spv.GLSL.Sqrt` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Sqrt %0 : f32 + %3 = spv.GLSL.Sqrt %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPV_GLSLTanhOp : SPV_GLSLUnaryArithmeticOp<"Tanh", 21, SPV_Float16or32> { + let summary = "Hyperbolic tangent of operand in radians"; + + let description = [{ + Hyperbolic tangent of x radians. + + The operand x must be a scalar or vector whose component type is 16-bit or + 32-bit floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` + restricted-float-scalar-type ::= `f16` | `f32` + restricted-float-scalar-vector-type ::= + restricted-float-scalar-type | + `vector<` integer-literal `x` restricted-float-scalar-type `>` + tanh-op ::= ssa-id `=` `spv.GLSL.Tanh` ssa-use `:` + restricted-float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Tanh %0 : f32 + %3 = spv.GLSL.Tanh %1 : vector<3xf16> + ``` + }]; +} + +#endif // SPIRV_GLSL_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVGroupOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVGroupOps.td new file mode 100644 index 0000000000000000000000000000000000000000..827636afbafa69c3590819f18f35ae8f83f7689f --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVGroupOps.td @@ -0,0 +1,65 @@ +//===-- SPIRVGroupOps.td - MLIR SPIR-V (Sub)Group Ops ------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains group and subgroup ops for the SPIR-V dialect. It +// corresponds to "3.32.21. 
Group and Subgroup Instructions" of the SPIR-V +// specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_GROUP_OPS +#define SPIRV_GROUP_OPS + +// ----- + +def SPV_SubgroupBallotKHROp : SPV_Op<"SubgroupBallotKHR", []> { + let summary = "See extension SPV_KHR_shader_ballot"; + + let description = [{ + Computes a bitfield value combining the Predicate value from all invocations + in the current Subgroup that execute the same dynamic instance of this + instruction. The bit is set to one if the corresponding invocation is active + and the predicate is evaluated to true; otherwise, it is set to zero. + + Predicate must be a Boolean type. + + Result Type must be a 4 component vector of 32 bit integer types. + + Result is a set of bitfields where the first invocation is represented in bit + 0 of the first vector component and the last (up to SubgroupSize) is the + higher bit number of the last bitmask needed to represent all bits of the + subgroup invocations. + + ### Custom assembly form + + ``` + subgroup-ballot-op ::= ssa-id `=` `spv.SubgroupBallotKHR` + ssa-use `:` `vector` `<` 4 `x` `i32` `>` + ``` + + For example: + + ``` + %0 = spv.SubgroupBallotKHR %predicate : vector<4xi32> + ``` + }]; + + let arguments = (ins + SPV_Bool:$predicate + ); + + let results = (outs + SPV_I32Vec4:$result + ); + + let verifier = [{ return success(); }]; +} + +// ----- + +#endif // SPIRV_GROUP_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLogicalOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVLogicalOps.td new file mode 100644 index 0000000000000000000000000000000000000000..ac377d5e866612fbe0de74524f3b83b07e094cab --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLogicalOps.td @@ -0,0 +1,991 @@ +//===-- SPIRVLogicalOps.td - MLIR SPIR-V Logical Ops -------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains arithmetic ops for the SPIR-V dialect. It corresponds +// to "3.32.15. Relational and Logical Instructions" of the SPIR-V spec. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_LOGICAL_OPS +#define SPIRV_LOGICAL_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +class SPV_LogicalBinaryOp traits = []> : + // Result type is SPV_Bool. + SPV_BinaryOp { + let parser = [{ return ::parseLogicalBinaryOp(parser, result); }]; + let printer = [{ return ::printLogicalOp(getOperation(), p); }]; +} + +class SPV_LogicalUnaryOp traits = []> : + // Result type is SPV_Bool. + SPV_UnaryOp { + let parser = [{ return ::parseLogicalUnaryOp(parser, result); }]; + let printer = [{ return ::printLogicalOp(getOperation(), p); }]; +} + +// ----- + +def SPV_FOrdEqualOp : SPV_LogicalBinaryOp<"FOrdEqual", SPV_Float, [Commutative]> { + let summary = "Floating-point comparison for being ordered and equal."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordequal-op ::= ssa-id `=` `spv.FOrdEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdEqual %0, %1 : f32 + %5 = spv.FOrdEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FOrdGreaterThanOp : SPV_LogicalBinaryOp<"FOrdGreaterThan", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are ordered and Operand 1 is + greater than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordgt-op ::= ssa-id `=` `spv.FOrdGreaterThan` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdGreaterThan %0, %1 : f32 + %5 = spv.FOrdGreaterThan %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FOrdGreaterThanEqualOp : SPV_LogicalBinaryOp<"FOrdGreaterThanEqual", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are ordered and Operand 1 is + greater than or equal to Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordgte-op ::= ssa-id `=` `spv.FOrdGreaterThanEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdGreaterThanEqual %0, %1 : f32 + %5 = spv.FOrdGreaterThanEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FOrdLessThanOp : SPV_LogicalBinaryOp<"FOrdLessThan", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are ordered and Operand 1 is less + than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordlt-op ::= ssa-id `=` `spv.FOrdLessThan` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdLessThan %0, %1 : f32 + %5 = spv.FOrdLessThan %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FOrdLessThanEqualOp : SPV_LogicalBinaryOp<"FOrdLessThanEqual", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are ordered and Operand 1 is less + than or equal to Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordlte-op ::= ssa-id `=` `spv.FOrdLessThanEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdLessThanEqual %0, %1 : f32 + %5 = spv.FOrdLessThanEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FOrdNotEqualOp : SPV_LogicalBinaryOp<"FOrdNotEqual", SPV_Float, [Commutative]> { + let summary = "Floating-point comparison for being ordered and not equal."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + fordneq-op ::= ssa-id `=` `spv.FOrdNotEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FOrdNotEqual %0, %1 : f32 + %5 = spv.FOrdNotEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordEqualOp : SPV_LogicalBinaryOp<"FUnordEqual", SPV_Float, [Commutative]> { + let summary = "Floating-point comparison for being unordered or equal."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordequal-op ::= ssa-id `=` `spv.FUnordEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordEqual %0, %1 : f32 + %5 = spv.FUnordEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordGreaterThanOp : SPV_LogicalBinaryOp<"FUnordGreaterThan", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are unordered or Operand 1 is + greater than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordgt-op ::= ssa-id `=` `spv.FUnordGreaterThan` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordGreaterThan %0, %1 : f32 + %5 = spv.FUnordGreaterThan %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordGreaterThanEqualOp : SPV_LogicalBinaryOp<"FUnordGreaterThanEqual", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are unordered or Operand 1 is + greater than or equal to Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordgte-op ::= ssa-id `=` `spv.FUnordGreaterThanEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordGreaterThanEqual %0, %1 : f32 + %5 = spv.FUnordGreaterThanEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordLessThanOp : SPV_LogicalBinaryOp<"FUnordLessThan", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are unordered or Operand 1 is less + than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordlt-op ::= ssa-id `=` `spv.FUnordLessThan` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordLessThan %0, %1 : f32 + %5 = spv.FUnordLessThan %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordLessThanEqualOp : SPV_LogicalBinaryOp<"FUnordLessThanEqual", SPV_Float, []> { + let summary = [{ + Floating-point comparison if operands are unordered or Operand 1 is less + than or equal to Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordlte-op ::= ssa-id `=` `spv.FUnordLessThanEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordLessThanEqual %0, %1 : f32 + %5 = spv.FUnordLessThanEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_FUnordNotEqualOp : SPV_LogicalBinaryOp<"FUnordNotEqual", SPV_Float, [Commutative]> { + let summary = "Floating-point comparison for being unordered or not equal."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + floating-point type. They must have the same type, and they must have + the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + funordneq-op ::= ssa-id `=` `spv.FUnordNotEqual` ssa-use, ssa-use + ``` + + For example: + + ``` + %4 = spv.FUnordNotEqual %0, %1 : f32 + %5 = spv.FUnordNotEqual %2, %3 : vector<4xf32> + ``` + }]; +} + +// ----- + +def SPV_IEqualOp : SPV_LogicalBinaryOp<"IEqual", SPV_Integer, [Commutative]> { + let summary = "Integer comparison for equality."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + iequal-op ::= ssa-id `=` `spv.IEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.IEqual %0, %1 : i32 + %5 = spv.IEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_INotEqualOp : SPV_LogicalBinaryOp<"INotEqual", SPV_Integer, [Commutative]> { + let summary = "Integer comparison for inequality."; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + inot-equal-op ::= ssa-id `=` `spv.INotEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.INotEqual %0, %1 : i32 + %5 = spv.INotEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_LogicalAndOp : SPV_LogicalBinaryOp<"LogicalAnd", SPV_Bool, [Commutative]> { + let summary = [{ + Result is true if both Operand 1 and Operand 2 are true. Result is false + if either Operand 1 or Operand 2 are false. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 must be the same as Result Type. + + The type of Operand 2 must be the same as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + + ``` + logical-and ::= `spv.LogicalAnd` ssa-use `,` ssa-use + `:` operand-type + ``` + + For example: + + ``` + %2 = spv.LogicalAnd %0, %1 : i1 + %2 = spv.LogicalAnd %0, %1 : vector<4xi1> + ``` + }]; +} + +// ----- + +def SPV_LogicalEqualOp : SPV_LogicalBinaryOp<"LogicalEqual", SPV_Bool, [Commutative]> { + let summary = [{ + Result is true if Operand 1 and Operand 2 have the same value. Result is + false if Operand 1 and Operand 2 have different values. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 must be the same as Result Type. + + The type of Operand 2 must be the same as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + logical-equal ::= `spv.LogicalEqual` ssa-use `,` ssa-use + `:` operand-type + ``` + + For example: + + ``` + %2 = spv.LogicalEqual %0, %1 : i1 + %2 = spv.LogicalEqual %0, %1 : vector<4xi1> + ``` + }]; +} + +// ----- + +def SPV_LogicalNotOp : SPV_LogicalUnaryOp<"LogicalNot", SPV_Bool, []> { + let summary = [{ + Result is true if Operand is false. Result is false if Operand is true. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand must be the same as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + logical-not ::= `spv.LogicalNot` ssa-use `:` operand-type + ``` + + For example: + + ``` + %2 = spv.LogicalNot %0 : i1 + %2 = spv.LogicalNot %0 : vector<4xi1> + ``` + }]; + + let hasCanonicalizer = 1; +} + +// ----- + +def SPV_LogicalNotEqualOp : SPV_LogicalBinaryOp<"LogicalNotEqual", SPV_Bool, [Commutative]> { + let summary = [{ + Result is true if Operand 1 and Operand 2 have different values. Result + is false if Operand 1 and Operand 2 have the same value. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 must be the same as Result Type. 
+ + The type of Operand 2 must be the same as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + logical-not-equal ::= `spv.LogicalNotEqual` ssa-use `,` ssa-use + `:` operand-type + ``` + + For example: + + ``` + %2 = spv.LogicalNotEqual %0, %1 : i1 + %2 = spv.LogicalNotEqual %0, %1 : vector<4xi1> + ``` + }]; +} + +// ----- + +def SPV_LogicalOrOp : SPV_LogicalBinaryOp<"LogicalOr", SPV_Bool, [Commutative]> { + let summary = [{ + Result is true if either Operand 1 or Operand 2 is true. Result is false + if both Operand 1 and Operand 2 are false. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 must be the same as Result Type. + + The type of Operand 2 must be the same as Result Type. + + Results are computed per component. + + ### Custom assembly form + + ``` + logical-or ::= `spv.LogicalOr` ssa-use `,` ssa-use + `:` operand-type + ``` + + For example: + + ``` + %2 = spv.LogicalOr %0, %1 : i1 + %2 = spv.LogicalOr %0, %1 : vector<4xi1> + ``` + }]; +} + +// ----- + +def SPV_SGreaterThanOp : SPV_LogicalBinaryOp<"SGreaterThan", SPV_Integer, []> { + let summary = [{ + Signed-integer comparison if Operand 1 is greater than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sgreater-than-op ::= ssa-id `=` `spv.SGreaterThan` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SGreaterThan %0, %1 : i32 + %5 = spv.SGreaterThan %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SGreaterThanEqualOp : SPV_LogicalBinaryOp<"SGreaterThanEqual", SPV_Integer, []> { + let summary = [{ + Signed-integer comparison if Operand 1 is greater than or equal to + Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sgreater-than-equal-op ::= ssa-id `=` `spv.SGreaterThanEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SGreaterThanEqual %0, %1 : i32 + %5 = spv.SGreaterThanEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SLessThanOp : SPV_LogicalBinaryOp<"SLessThan", SPV_Integer, []> { + let summary = [{ + Signed-integer comparison if Operand 1 is less than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sless-than-op ::= ssa-id `=` `spv.SLessThan` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SLessThan %0, %1 : i32 + %5 = spv.SLessThan %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SLessThanEqualOp : SPV_LogicalBinaryOp<"SLessThanEqual", SPV_Integer, []> { + let summary = [{ + Signed-integer comparison if Operand 1 is less than or equal to Operand + 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + sless-than-equal-op ::= ssa-id `=` `spv.SLessThanEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.SLessThanEqual %0, %1 : i32 + %5 = spv.SLessThanEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_SelectOp : SPV_Op<"Select", [NoSideEffect]> { + let summary = [{ + Select between two objects. Before version 1.4, results are only + computed per component. + }]; + + let description = [{ + Before version 1.4, Result Type must be a pointer, scalar, or vector. + + The types of Object 1 and Object 2 must be the same as Result Type. + + Condition must be a scalar or vector of Boolean type. + + If Condition is a scalar and true, the result is Object 1. If Condition + is a scalar and false, the result is Object 2. 
+ + If Condition is a vector, Result Type must be a vector with the same + number of components as Condition and the result is a mix of Object 1 + and Object 2: When a component of Condition is true, the corresponding + component in the result is taken from Object 1, otherwise it is taken + from Object 2. + + ### Custom assembly form + + ``` + scalar-type ::= integer-type | float-type | boolean-type + select-object-type ::= scalar-type + | `vector<` integer-literal `x` scalar-type `>` + | pointer-type + select-condition-type ::= boolean-type + | `vector<` integer-literal `x` boolean-type `>` + select-op ::= ssa-id `=` `spv.Select` ssa-use, ssa-use, ssa-use + `:` select-condition-type `,` select-object-type + ``` + + For example: + + ``` + %3 = spv.Select %0, %1, %2 : i1, f32 + %3 = spv.Select %0, %1, %2 : i1, vector<3xi32> + %3 = spv.Select %0, %1, %2 : vector<3xi1>, vector<3xf32> + ``` + }]; + + let arguments = (ins + SPV_ScalarOrVectorOf:$condition, + SPV_SelectType:$true_value, + SPV_SelectType:$false_value + ); + + let results = (outs + SPV_SelectType:$result + ); + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + Value cond, Value trueValue, + Value falseValue}]>]; +} + +// ----- + +def SPV_UGreaterThanOp : SPV_LogicalBinaryOp<"UGreaterThan", SPV_Integer, []> { + let summary = [{ + Unsigned-integer comparison if Operand 1 is greater than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + ugreater-than-op ::= ssa-id `=` `spv.UGreaterThan` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.UGreaterThan %0, %1 : i32 + %5 = spv.UGreaterThan %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_UGreaterThanEqualOp : SPV_LogicalBinaryOp<"UGreaterThanEqual", SPV_Integer, []> { + let summary = [{ + Unsigned-integer comparison if Operand 1 is greater than or equal to + Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + ugreater-than-equal-op ::= ssa-id `=` `spv.UGreaterThanEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.UGreaterThanEqual %0, %1 : i32 + %5 = spv.UGreaterThanEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_ULessThanOp : SPV_LogicalBinaryOp<"ULessThan", SPV_Integer, []> { + let summary = [{ + Unsigned-integer comparison if Operand 1 is less than Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. 
+ + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + uless-than-op ::= ssa-id `=` `spv.ULessThan` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.ULessThan %0, %1 : i32 + %5 = spv.ULessThan %2, %3 : vector<4xi32> + + ``` + }]; +} + +// ----- + +def SPV_ULessThanEqualOp : + SPV_LogicalBinaryOp<"ULessThanEqual", SPV_Integer, []> { + let summary = [{ + Unsigned-integer comparison if Operand 1 is less than or equal to + Operand 2. + }]; + + let description = [{ + Result Type must be a scalar or vector of Boolean type. + + The type of Operand 1 and Operand 2 must be a scalar or vector of + integer type. They must have the same component width, and they must + have the same number of components as Result Type. + + Results are computed per component. + + ### Custom assembly form + ``` + integer-scalar-vector-type ::= integer-type | + `vector<` integer-literal `x` integer-type `>` + uless-than-equal-op ::= ssa-id `=` `spv.ULessThanEqual` ssa-use, ssa-use + `:` integer-scalar-vector-type + ``` + For example: + + ``` + %4 = spv.ULessThanEqual %0, %1 : i32 + %5 = spv.ULessThanEqual %2, %3 : vector<4xi32> + + ``` + }]; +} + +#endif // SPIRV_LOGICAL_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h new file mode 100644 index 0000000000000000000000000000000000000000..0f481f5956d180eba05509ee0e9d19c27dfede36 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h @@ -0,0 +1,86 @@ +//===- SPIRVLowering.h - SPIR-V lowering utilities -------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines utilities to use while targeting SPIR-V dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SPIRVLOWERING_H +#define MLIR_DIALECT_SPIRV_SPIRVLOWERING_H + +#include "mlir/Dialect/SPIRV/SPIRVOps.h" +#include "mlir/IR/Attributes.h" +#include "mlir/Support/StringExtras.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/SetVector.h" + +namespace mlir { + +/// Type conversion from standard types to SPIR-V types for shader interface. +/// +/// For composite types, this converter additionally performs type wrapping to +/// satisfy shader interface requirements: shader interface types must be +/// pointers to structs. +class SPIRVTypeConverter final : public TypeConverter { +public: + using TypeConverter::TypeConverter; + + /// Converts the given standard `type` to SPIR-V correspondence. + Type convertType(Type type) override; + + /// Gets the SPIR-V correspondence for the standard index type. + static Type getIndexType(MLIRContext *context); +}; + +/// Base class to define a conversion pattern to lower `SourceOp` into SPIR-V. +template +class SPIRVOpLowering : public OpConversionPattern { +public: + SPIRVOpLowering(MLIRContext *context, SPIRVTypeConverter &typeConverter, + PatternBenefit benefit = 1) + : OpConversionPattern(context, benefit), + typeConverter(typeConverter) {} + +protected: + SPIRVTypeConverter &typeConverter; +}; + +#include "mlir/Dialect/SPIRV/SPIRVLowering.h.inc" + +namespace spirv { +/// Returns a value that represents a builtin variable value within the SPIR-V +/// module. +Value getBuiltinVariableValue(Operation *op, spirv::BuiltIn builtin, + OpBuilder &builder); + +/// Attribute name for specifying argument ABI information. 
+StringRef getInterfaceVarABIAttrName(); + +/// Get the InterfaceVarABIAttr given its fields. +InterfaceVarABIAttr getInterfaceVarABIAttr(unsigned descriptorSet, + unsigned binding, + spirv::StorageClass storageClass, + MLIRContext *context); + +/// Attribute name for specifying entry point information. +StringRef getEntryPointABIAttrName(); + +/// Get the EntryPointABIAttr given its fields. +EntryPointABIAttr getEntryPointABIAttr(ArrayRef localSize, + MLIRContext *context); + +/// Sets the InterfaceVarABIAttr and EntryPointABIAttr for a function and its +/// arguments +LogicalResult setABIAttrs(FuncOp funcOp, + spirv::EntryPointABIAttr entryPointInfo, + ArrayRef argABIInfo); + +} // namespace spirv +} // namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SPIRVLOWERING_H diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.td new file mode 100644 index 0000000000000000000000000000000000000000..91a8ff68bbf86229156aaa5cc1417f1db3e668fa --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.td @@ -0,0 +1,46 @@ +//===- SPIRVBase.td - MLIR SPIR-V Op Definitions Base file -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the base file for supporting lowering to SPIR-V dialect. This +// file defines SPIR-V attributes used for specifying the shader +// interface or ABI. This is because SPIR-V module is expected to work in +// an execution environment as specified by a client API. A SPIR-V module +// needs to "link" correctly with the execution environment regarding the +// resources that are used in the SPIR-V module and get populated with +// data via the client API. 
The shader interface (or ABI) is passed into +// SPIR-V lowering path via attributes defined in this file. A +// compilation flow targeting SPIR-V is expected to attach such +// attributes to resources and other suitable places. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_LOWERING +#define SPIRV_LOWERING + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +// For arguments that eventually map to spv.globalVariable for the +// shader interface, this attribute specifies the information regarding +// the global variable : +// 1) Descriptor Set. +// 2) Binding number. +// 3) Storage class. +def SPV_InterfaceVarABIAttr: + StructAttr<"InterfaceVarABIAttr", SPV_Dialect, + [StructFieldAttr<"descriptor_set", I32Attr>, + StructFieldAttr<"binding", I32Attr>, + StructFieldAttr<"storage_class", SPV_StorageClassAttr>]>; + +// For entry functions, this attribute specifies information related to entry +// points in the generated SPIR-V module: +// 1) WorkGroup Size. +def SPV_EntryPointABIAttr: + StructAttr<"EntryPointABIAttr", SPV_Dialect, + [StructFieldAttr<"local_size", I32ElementsAttr>]>; + +#endif // SPIRV_LOWERING diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td new file mode 100644 index 0000000000000000000000000000000000000000..f3a9a61a9e93853587cbac2de16edf9a573728c4 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVNonUniformOps.td @@ -0,0 +1,69 @@ +//===-- SPIRVNonUniformOps.td - MLIR SPIR-V NonUniform Ops -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains non-uniform ops for the SPIR-V dialect. It corresponds to +// "3.32.24. 
Non-Uniform Instructions" of the SPIR-V specification. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_NON_UNIFORM_OPS +#define SPIRV_NON_UNIFORM_OPS + +// ----- + +def SPV_GroupNonUniformBallotOp : SPV_Op<"GroupNonUniformBallot", []> { + let summary = [{ + Returns a bitfield value combining the Predicate value from all + invocations in the group that execute the same dynamic instance of this + instruction. The bit is set to one if the corresponding invocation is + active and the Predicate for that invocation evaluated to true; + otherwise, it is set to zero. + }]; + + let description = [{ + Result Type must be a vector of four components of integer type scalar, + whose Signedness operand is 0. + + Result is a set of bitfields where the first invocation is represented + in the lowest bit of the first vector component and the last (up to the + size of the group) is the higher bit number of the last bitmask needed + to represent all bits of the group invocations. + + Execution must be Workgroup or Subgroup Scope. + + Predicate must be a Boolean type. 
+ + ### Custom assembly form + + ``` + scope ::= `"Workgroup"` | `"Subgroup"` + non-uniform-ballot-op ::= ssa-id `=` `spv.GroupNonUniformBallot` scope + ssa-use `:` `vector` `<` 4 `x` `integer-type` `>` + ``` + + For example: + + ``` + %0 = spv.GroupNonUniformBallot "Subgroup" %predicate : vector<4xi32> + ``` + }]; + + let arguments = (ins + SPV_ScopeAttr:$execution_scope, + SPV_Bool:$predicate + ); + + let results = (outs + SPV_IntVec4:$result + ); +} + +// ----- + +#endif // SPIRV_NON_UNIFORM_OPS + diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h new file mode 100644 index 0000000000000000000000000000000000000000..2fa417bfe25cbfcf19aeab28467945ae1090642f --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.h @@ -0,0 +1,41 @@ +//===- SPIRVOps.h - MLIR SPIR-V operations ----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the operations in the SPIR-V dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SPIRVOPS_H_ +#define MLIR_DIALECT_SPIRV_SPIRVOPS_H_ + +#include "mlir/Dialect/SPIRV/SPIRVTypes.h" +#include "mlir/IR/Function.h" + +namespace mlir { +class OpBuilder; + +namespace spirv { + +#define GET_OP_CLASSES +#include "mlir/Dialect/SPIRV/SPIRVOps.h.inc" + +/// Following methods are auto-generated. +/// +/// Get the name used in the Op to refer to an enum value of the given +/// `EnumClass`. +/// template StringRef attributeName(); +/// +/// Get the function that can be used to symbolize an enum value. 
+/// template +/// Optional (*)(StringRef) symbolizeEnum(); +#include "mlir/Dialect/SPIRV/SPIRVOpUtils.inc" + +} // end namespace spirv +} // end namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SPIRVOPS_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td new file mode 100644 index 0000000000000000000000000000000000000000..1ce28928c41c55d25216e239dc609a8bdde2e2fe --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td @@ -0,0 +1,468 @@ +//===-- SPIRVOps.td - MLIR SPIR-V Op Definitions Spec ------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the main operation definition specification file for SPIR-V +// operations. +// +//===----------------------------------------------------------------------===// + +// Note that for each op in this file and the included files for specific op +// categories, we use a tool to automatically generate certain sections in its +// definition: basic structure, summary, description. So modifications to these +// sections will not be respected. Modifications to op traits, arguments, +// results, and sections after the results are retained. Besides, ops must be +// separated via the '// -----' marker. 
+ +#ifndef SPIRV_OPS +#define SPIRV_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" +include "mlir/Dialect/SPIRV/SPIRVArithmeticOps.td" +include "mlir/Dialect/SPIRV/SPIRVAtomicOps.td" +include "mlir/Dialect/SPIRV/SPIRVBitOps.td" +include "mlir/Dialect/SPIRV/SPIRVCastOps.td" +include "mlir/Dialect/SPIRV/SPIRVCompositeOps.td" +include "mlir/Dialect/SPIRV/SPIRVControlFlowOps.td" +include "mlir/Dialect/SPIRV/SPIRVGLSLOps.td" +include "mlir/Dialect/SPIRV/SPIRVGroupOps.td" +include "mlir/Dialect/SPIRV/SPIRVLogicalOps.td" +include "mlir/Dialect/SPIRV/SPIRVNonUniformOps.td" +include "mlir/Dialect/SPIRV/SPIRVStructureOps.td" + +// ----- + +def SPV_AccessChainOp : SPV_Op<"AccessChain", [NoSideEffect]> { + let summary = [{ + Create a pointer into a composite object that can be used with OpLoad + and OpStore. + }]; + + let description = [{ + Result Type must be an OpTypePointer. Its Type operand must be the type + reached by walking the Base’s type hierarchy down to the last provided + index in Indexes, and its Storage Class operand must be the same as the + Storage Class of Base. + + Base must be a pointer, pointing to the base of a composite object. + + Indexes walk the type hierarchy to the desired depth, potentially down + to scalar granularity. The first index in Indexes will select the top- + level member/element/component/element of the base composite. All + composite constituents use zero-based numbering, as described by their + OpType… instruction. The second index will apply similarly to that + result, and so on. Once any non-composite type is reached, there must be + no remaining (unused) indexes. + + Each index in Indexes + + - must be a scalar integer type, + + - is treated as a signed count, and + + - must be an OpConstant when indexing into a structure. 
+ + ### Custom assembly form + ``` + access-chain-op ::= ssa-id `=` `spv.AccessChain` ssa-use + `[` ssa-use (',' ssa-use)* `]` + `:` pointer-type + ``` + + For example: + + ``` + %0 = "spv.constant"() { value = 1: i32} : () -> i32 + %1 = spv.Variable : !spv.ptr>, Function> + %2 = spv.AccessChain %1[%0] : !spv.ptr>, Function> + %3 = spv.Load "Function" %2 ["Volatile"] : !spv.array<4xf32> + ``` + }]; + + let arguments = (ins + SPV_AnyPtr:$base_ptr, + Variadic:$indices + ); + + let results = (outs + SPV_AnyPtr:$component_ptr + ); + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + Value basePtr, ValueRange indices}]>]; + + let hasCanonicalizer = 1; +} + +// ----- + +def SPV_ControlBarrierOp : SPV_Op<"ControlBarrier", []> { + let summary = [{ + Wait for other invocations of this module to reach the current point of + execution. + }]; + + let description = [{ + All invocations of this module within Execution scope must reach this + point of execution before any invocation will proceed beyond it. + + When Execution is Workgroup or larger, behavior is undefined if this + instruction is used in control flow that is non-uniform within + Execution. When Execution is Subgroup or Invocation, the behavior of + this instruction in non-uniform control flow is defined by the client + API. + + If Semantics is not None, this instruction also serves as an + OpMemoryBarrier instruction, and must also perform and adhere to the + description and semantics of an OpMemoryBarrier instruction with the + same Memory and Semantics operands. This allows atomically specifying + both a control barrier and a memory barrier (that is, without needing + two instructions). If Semantics is None, Memory is ignored. + + Before version 1.3, it is only valid to use this instruction with + TessellationControl, GLCompute, or Kernel execution models. There is no + such restriction starting with version 1.3. 
+ + When used with the TessellationControl execution model, it also + implicitly synchronizes the Output Storage Class: Writes to Output + variables performed by any invocation executed prior to a + OpControlBarrier will be visible to any other invocation after return + from that OpControlBarrier. + + ### Custom assembly form + + ``` + scope ::= `"CrossDevice"` | `"Device"` | `"Workgroup"` | ... + + memory-semantics ::= `"None"` | `"Acquire"` | "Release"` | ... + + control-barrier-op ::= `spv.ControlBarrier` scope, scope, memory-semantics + ``` + + For example: + + ``` + spv.ControlBarrier "Workgroup", "Device", "Acquire|UniformMemory" + + ``` + }]; + + let arguments = (ins + SPV_ScopeAttr:$execution_scope, + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$memory_semantics + ); + + let results = (outs); + + let verifier = [{ return verifyMemorySemantics(*this); }]; + + let autogenSerialization = 0; +} + +// ----- + +def SPV_ExecutionModeOp : SPV_Op<"ExecutionMode", [InModuleScope]> { + let summary = "Declare an execution mode for an entry point."; + + let description = [{ + Entry Point must be the Entry Point operand of an OpEntryPoint + instruction. + + Mode is the execution mode. See Execution Mode. + + This instruction is only valid when the Mode operand is an execution + mode that takes no Extra Operands, or takes Extra Operands that are not + operands. + + ### Custom assembly form + + ``` + execution-mode ::= "Invocations" | "SpacingEqual" | + + + execution-mode-op ::= `spv.ExecutionMode ` ssa-use execution-mode + (integer-literal (`, ` integer-literal)* )? 
+ ``` + + For example: + + ``` + spv.ExecutionMode @foo "ContractionOff" + spv.ExecutionMode @bar "LocalSizeHint", 3, 4, 5 + ``` + }]; + + let arguments = (ins + FlatSymbolRefAttr:$fn, + SPV_ExecutionModeAttr:$execution_mode, + I32ArrayAttr:$values + ); + + let results = (outs); + + let verifier = [{ return success(); }]; + + let autogenSerialization = 0; + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + FuncOp function, + spirv::ExecutionMode executionMode, + ArrayRef params}]>]; +} + +// ----- + +def SPV_LoadOp : SPV_Op<"Load", []> { + let summary = "Load through a pointer."; + + let description = [{ + Result Type is the type of the loaded object. It must be a type with + fixed size; i.e., it cannot be, nor include, any OpTypeRuntimeArray + types. + + Pointer is the pointer to load through. Its type must be an + OpTypePointer whose Type operand is the same as Result Type. + + If present, any Memory Operands must begin with a memory operand + literal. If not present, it is the same as specifying the memory operand + None. + + ### Custom assembly form + + ``` + memory-access ::= `"None"` | `"Volatile"` | `"Aligned", ` integer-literal + | `"NonTemporal"` + + load-op ::= ssa-id ` = spv.Load ` storage-class ssa-use + (`[` memory-access `]`)? 
` : ` spirv-element-type + ``` + + For example: + + ``` + %0 = spv.Variable : !spv.ptr + %1 = spv.Load "Function" %0 : f32 + %2 = spv.Load "Function" %0 ["Volatile"] : f32 + %3 = spv.Load "Function" %0 ["Aligned", 4] : f32 + ``` + }]; + + let arguments = (ins + SPV_AnyPtr:$ptr, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let results = (outs + SPV_Type:$value + ); + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + Value basePtr, /*optional*/IntegerAttr memory_access, + /*optional*/IntegerAttr alignment}]>]; +} + +// ----- + +def SPV_MemoryBarrierOp : SPV_Op<"MemoryBarrier", []> { + let summary = "Control the order that memory accesses are observed."; + + let description = [{ + Ensures that memory accesses issued before this instruction will be + observed before memory accesses issued after this instruction. This + control is ensured only for memory accesses issued by this invocation + and observed by another invocation executing within Memory scope. If the + Vulkan memory model is declared, this ordering only applies to memory + accesses that use the NonPrivatePointer memory operand or + NonPrivateTexel image operand. + + Semantics declares what kind of memory is being controlled and what kind + of control to apply. + + To execute both a memory barrier and a control barrier, see + OpControlBarrier. + + ### Custom assembly form + + ``` + scope ::= `"CrossDevice"` | `"Device"` | `"Workgroup"` | ... + + memory-semantics ::= `"None"` | `"Acquire"` | `"Release"` | ... 
+ + memory-barrier-op ::= `spv.MemoryBarrier` scope, memory-semantics + ``` + + For example: + + ``` + spv.MemoryBarrier "Device", "Acquire|UniformMemory" + + ``` + }]; + + let arguments = (ins + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$memory_semantics + ); + + let results = (outs); + + let verifier = [{ return verifyMemorySemantics(*this); }]; + + let autogenSerialization = 0; +} + +// ----- + +def SPV_StoreOp : SPV_Op<"Store", []> { + let summary = "Store through a pointer."; + + let description = [{ + Pointer is the pointer to store through. Its type must be an + OpTypePointer whose Type operand is the same as the type of Object. + + Object is the object to store. + + If present, any Memory Operands must begin with a memory operand + literal. If not present, it is the same as specifying the memory operand + None. + + ### Custom assembly form + + ``` + store-op ::= `spv.Store ` storage-class ssa-use `, ` ssa-use `, ` + (`[` memory-access `]`)? `:` spirv-element-type + ``` + + For example: + + ``` + %0 = spv.Variable : !spv.ptr + %1 = spv.FMul ... : f32 + spv.Store "Function" %0, %1 : f32 + spv.Store "Function" %0, %1 ["Volatile"] : f32 + spv.Store "Function" %0, %1 ["Aligned", 4] : f32 + ``` + }]; + + let arguments = (ins + SPV_AnyPtr:$ptr, + SPV_Type:$value, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "Value ptr, Value value, ArrayRef namedAttrs", [{ + state.addOperands(ptr); + state.addOperands(value); + state.addAttributes(namedAttrs); + }]> + ]; + + let results = (outs); +} + +// ----- + +def SPV_UndefOp : SPV_Op<"undef", []> { + let summary = "Make an intermediate object whose value is undefined."; + + let description = [{ + Result Type is the type of object to make. + + Each consumption of Result yields an arbitrary, possibly different + bit pattern or abstract value resulting in possibly different concrete, + abstract, or opaque values. 
+ + ### Custom assembly form + + ``` + undef-op ::= `spv.undef` `:` spirv-type + ``` + + For example: + + ``` + %0 = spv.undef : f32 + %1 = spv.undef : !spv.struct>> + ``` + }]; + + let arguments = (ins); + + let results = (outs + SPV_Type:$result + ); + + let verifier = [{ return success(); }]; + + let hasOpcode = 0; + let autogenSerialization = 0; +} + +// ----- + +def SPV_VariableOp : SPV_Op<"Variable", []> { + let summary = [{ + Allocate an object in memory, resulting in a pointer to it, which can be + used with OpLoad and OpStore. + }]; + + let description = [{ + Result Type must be an OpTypePointer. Its Type operand is the type of + object in memory. + + Storage Class is the Storage Class of the memory holding the object. It + cannot be Generic. It must be the same as the Storage Class operand of + the Result Type. + + Initializer is optional. If Initializer is present, it will be the + initial value of the variable’s memory content. Initializer must be an + <id> from a constant instruction or a global (module scope) OpVariable + instruction. Initializer must have the same type as the type pointed to + by Result Type. + + ### Custom assembly form + + ``` + variable-op ::= ssa-id `=` `spv.Variable` (`init(` ssa-use `)`)? + (`bind(` integer-literal, integer-literal `)`)? + (`built_in(` string-literal `)`)? + attribute-dict? `:` spirv-pointer-type + ``` + + where `init` specifies initializer and `bind` specifies the + descriptor set and binding number. `built_in` specifies SPIR-V + BuiltIn decoration associated with the op. + + For example: + + ``` + %0 = spv.constant ... 
+ + %1 = spv.Variable : !spv.ptr + %2 = spv.Variable init(%0): !spv.ptr + %3 = spv.Variable init(%0) bind(1, 2): !spv.ptr + %3 = spv.Variable built_in("GlobalInvocationID") : !spv.ptr, Uniform> + ``` + }]; + + let arguments = (ins + SPV_StorageClassAttr:$storage_class, + SPV_Optional:$initializer + ); + + let results = (outs + SPV_AnyPtr:$pointer + ); +} + +// ----- + +#endif // SPIRV_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVStructureOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVStructureOps.td new file mode 100644 index 0000000000000000000000000000000000000000..c37796b9f60a38ad049e185f2408ecb4aeabba18 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVStructureOps.td @@ -0,0 +1,461 @@ +//===-- SPIRVStructureOps.td - MLIR SPIR-V Structure Ops ---*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains ops for defining the SPIR-V structure: module, function, +// and module-level operations. The representational form of these ops deviate +// from the SPIR-V binary format in order to utilize MLIR mechanisms. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_STRUCTURE_OPS +#define SPIRV_STRUCTURE_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +def SPV_AddressOfOp : SPV_Op<"_address_of", [InFunctionScope, NoSideEffect]> { + let summary = "Get the address of a global variable."; + + let description = [{ + Variables in module scope are defined using symbol names. This op generates + an SSA value that can be used to refer to the symbol within function scope + for use in ops that expect an SSA value. This operation has no corresponding + SPIR-V instruction; it's merely used for modelling purpose in the SPIR-V + dialect. 
Since variables in module scope in SPIR-V dialect are of pointer + type, this op returns a pointer type as well, and the type is the same as + the variable referenced. + + ### Custom assembly form + + ``` + spv-address-of-op ::= ssa-id `=` `spv._address_of` symbol-ref-id + `:` spirv-pointer-type + ``` + + For example: + + ``` + %0 = spv._address_of @global_var : !spv.ptr + ``` + }]; + + let arguments = (ins + FlatSymbolRefAttr:$variable + ); + + let results = (outs + SPV_AnyPtr:$pointer + ); + + let hasOpcode = 0; + + let autogenSerialization = 0; + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + spirv::GlobalVariableOp var}]>]; +} + +def SPV_ConstantOp : SPV_Op<"constant", [NoSideEffect]> { + let summary = "The op that declares a SPIR-V normal constant"; + + let description = [{ + This op declares a SPIR-V normal constant. SPIR-V has multiple constant + instructions covering different constant types: + + * `OpConstantTrue` and `OpConstantFalse` for boolean constants + * `OpConstant` for scalar constants + * `OpConstantComposite` for composite constants + * `OpConstantNull` for null constants + * ... + + Having such a plethora of constant instructions renders IR transformations + more tedious. Therefore, we use a single `spv.constant` op to represent + them all. Note that conversion between those SPIR-V constant instructions + and this op is purely mechanical; so it can be scoped to the binary + (de)serialization process. + + ### Custom assembly form + + ``` + spv-constant-op ::= ssa-id `=` `spv.constant` attribute-value + (`:` spirv-type)? 
+ ``` + + For example: + + ``` + %0 = spv.constant true + %1 = spv.constant dense<[2, 3]> : vector<2xf32> + %2 = spv.constant [dense<3.0> : vector<2xf32>] : !spv.array<1xvector<2xf32>> + ``` + + TODO(antiagainst): support constant structs + }]; + + let arguments = (ins + AnyAttr:$value + ); + + let results = (outs + SPV_Type:$constant + ); + + let hasFolder = 1; + + let extraClassDeclaration = [{ + // Returns true if a constant can be built for the given `type`. + static bool isBuildableWith(Type type); + + // Creates a constant zero/one of the given `type` at the current insertion + // point of `builder` and returns it. + static spirv::ConstantOp getZero(Type type, Location loc, + OpBuilder *builder); + static spirv::ConstantOp getOne(Type type, Location loc, + OpBuilder *builder); + }]; + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +def SPV_EntryPointOp : SPV_Op<"EntryPoint", [InModuleScope]> { + let summary = [{ + Declare an entry point, its execution model, and its interface. + }]; + + let description = [{ + Execution Model is the execution model for the entry point and its + static call tree. See Execution Model. + + Entry Point must be the Result of an OpFunction instruction. + + Name is a name string for the entry point. A module cannot have two + OpEntryPoint instructions with the same Execution Model and the same + Name string. + + Interface is a list of symbol references to `spv.globalVariable` + operations. These declare the set of global variables from a + module that form the interface of this entry point. The set of + Interface symbols must be equal to or a superset of the + `spv.globalVariable`s referenced by the entry point’s static call + tree, within the interface’s storage classes. Before version 1.4, + the interface’s storage classes are limited to the Input and + Output storage classes. 
Starting with version 1.4, the interface’s + storage classes are all storage classes used in declaring all + global variables referenced by the entry point’s call tree. + + ### Custom assembly form + + ``` + execution-model ::= "Vertex" | "TesellationControl" | + + + entry-point-op ::= ssa-id `=` `spv.EntryPoint` execution-model + symbol-reference (`, ` symbol-reference)* + ``` + + For example: + + ``` + spv.EntryPoint "GLCompute" @foo + spv.EntryPoint "Kernel" @foo, @var1, @var2 + + ``` + }]; + + let arguments = (ins + SPV_ExecutionModelAttr:$execution_model, + FlatSymbolRefAttr:$fn, + SymbolRefArrayAttr:$interface + ); + + let results = (outs); + + let autogenSerialization = 0; + + let builders = [OpBuilder<[{Builder *builder, OperationState &state, + spirv::ExecutionModel executionModel, + FuncOp function, + ArrayRef interfaceVars}]>]; +} + + +def SPV_GlobalVariableOp : SPV_Op<"globalVariable", [InModuleScope, Symbol]> { + let summary = [{ + Allocate an object in memory at module scope. The object is + referenced using a symbol name. + }]; + + let description = [{ + The variable type must be an OpTypePointer. Its type operand is the type of + object in memory. + + Storage Class is the Storage Class of the memory holding the object. It + cannot be Generic. It must be the same as the Storage Class operand of + the variable types. Only those storage classes that are valid at module + scope (like Input, Output, StorageBuffer, etc.) are valid. + + Initializer is optional. If Initializer is present, it will be + the initial value of the variable’s memory content. Initializer + must be an symbol defined from a constant instruction or other + `spv.globalVariable` operation in module scope. Initializer must + have the same type as the type of the defined symbol. + + ### Custom assembly form + + ``` + variable-op ::= `spv.globalVariable` spirv-type symbol-ref-id + (`initializer(` symbol-ref-id `)`)? + (`bind(` integer-literal, integer-literal `)`)? 
+ (`built_in(` string-literal `)`)? + attribute-dict? + ``` + + where `initializer` specifies initializer and `bind` specifies the + descriptor set and binding number. `built_in` specifies SPIR-V + BuiltIn decoration associated with the op. + + For example: + + ``` + spv.globalVariable @var0 : !spv.ptr @var0 + spv.globalVariable @var1 initializer(@var0) : !spv.ptr + spv.globalVariable @var2 bind(1, 2) : !spv.ptr + spv.globalVariable @var3 built_in("GlobalInvocationId") : !spv.ptr, Input> + ``` + }]; + + let arguments = (ins + TypeAttr:$type, + StrAttr:$sym_name, + OptionalAttr:$initializer + ); + + let builders = [ + OpBuilder<"Builder *builder, OperationState &state, " + "TypeAttr type, ArrayRef namedAttrs", [{ + state.addAttribute("type", type); + state.addAttributes(namedAttrs); + }]>, + OpBuilder<[{Builder *builder, OperationState &state, + Type type, StringRef name, unsigned descriptorSet, + unsigned binding}]>, + OpBuilder<[{Builder *builder, OperationState &state, + Type type, StringRef name, spirv::BuiltIn builtin}]> + ]; + + let results = (outs); + + let hasOpcode = 0; + + let autogenSerialization = 0; + + let extraClassDeclaration = [{ + ::mlir::spirv::StorageClass storageClass() { + return this->type().cast<::mlir::spirv::PointerType>().getStorageClass(); + } + }]; +} + +def SPV_ModuleOp : SPV_Op<"module", + [IsolatedFromAbove, + SingleBlockImplicitTerminator<"ModuleEndOp">, + NativeOpTrait<"SymbolTable">]> { + let summary = "The top-level op that defines a SPIR-V module"; + + let description = [{ + This op defines a SPIR-V module using a MLIR region. The region contains + one block. Module-level operations, including functions definitions, + are all placed in this block. + + Using an op with a region to define a SPIR-V module enables "embedding" + SPIR-V modules in other dialects in a clean manner: this op guarantees + the validity and serializability of a SPIR-V module and thus serves as + a clear-cut boundary. 
+ + This op takes no operands and generates no results. This op should not + implicitly capture values from the enclosing environment. + + This op has only one region, which only contains one block. The block + must be terminated via the `spv._module_end` op. + + ### Custom assembly form + + ``` + addressing-model ::= `"Logical"` | `"Physical32"` | `"Physical64"` + memory-model ::= `"Simple"` | `"GLSL450"` | `"OpenCL"` | `"VulkanKHR"` + spv-module-op ::= `spv.module` addressing-model memory-model + region + (`attributes` attribute-dict)? + ``` + + For example: + + ``` + spv.module "Logical" "VulkanKHR" { } + + spv.module "Logical" "VulkanKHR" { + func @do_nothing() -> () { + spv.Return + } + } attributes { + capability = ["Shader"], + extension = ["SPV_KHR_16bit_storage"] + } + ``` + }]; + + let arguments = (ins + SPV_AddressingModelAttr:$addressing_model, + SPV_MemoryModelAttr:$memory_model, + OptionalAttr:$capabilities, + OptionalAttr:$extensions, + OptionalAttr:$extended_instruction_sets + ); + + let results = (outs); + + let regions = (region SizedRegion<1>:$body); + + let builders = + [OpBuilder<"Builder *, OperationState &state">, + OpBuilder<[{Builder *, OperationState &state, + IntegerAttr addressing_model, + IntegerAttr memory_model}]>, + OpBuilder<[{Builder *, OperationState &state, + spirv::AddressingModel addressing_model, + spirv::MemoryModel memory_model, + /*optional*/ ArrayRef capabilities = {}, + /*optional*/ ArrayRef extensions = {}, + /*optional*/ ArrayAttr extended_instruction_sets = nullptr}]>]; + + // We need to ensure the block inside the region is properly terminated; + // the auto-generated builders do not guarantee that. 
+ let skipDefaultBuilders = 1; + + let hasOpcode = 0; + + let autogenSerialization = 0; + + let extraClassDeclaration = [{ + Block& getBlock() { + return this->getOperation()->getRegion(0).front(); + } + }]; +} + +def SPV_ModuleEndOp : SPV_Op<"_module_end", [InModuleScope, Terminator]> { + let summary = "The pseudo op that ends a SPIR-V module"; + + let description = [{ + This op terminates the only block inside a `spv.module`'s only region. + This op does not have a corresponding SPIR-V instruction and thus will + not be serialized into the binary format; it is used solely to satisfy + the structual requirement that an block must be ended with a terminator. + }]; + + let arguments = (ins); + + let results = (outs); + + let parser = [{ return parseNoIOOp(parser, result); }]; + let printer = [{ printNoIOOp(getOperation(), p); }]; + + let verifier = [{ return success(); }]; + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +def SPV_ReferenceOfOp : SPV_Op<"_reference_of", [NoSideEffect]> { + let summary = "Reference a specialization constant."; + + let description = [{ + Specialization constant in module scope are defined using symbol names. + This op generates an SSA value that can be used to refer to the symbol + within function scope for use in ops that expect an SSA value. + This operation has no corresponding SPIR-V instruction; it's merely used + for modelling purpose in the SPIR-V dialect. This op's return type is + the same as the specialization constant. 
+ + ### Custom assembly form + + ``` + spv-reference-of-op ::= ssa-id `=` `spv._reference_of` symbol-ref-id + `:` spirv-scalar-type + ``` + + For example: + + ``` + %0 = spv._reference_of @spec_const : f32 + ``` + }]; + + let arguments = (ins + FlatSymbolRefAttr:$spec_const + ); + + let results = (outs + SPV_Type:$reference + ); + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +def SPV_SpecConstantOp : SPV_Op<"specConstant", [InModuleScope, Symbol]> { + let summary = "The op that declares a SPIR-V specialization constant"; + + let description = [{ + This op declares a SPIR-V scalar specialization constant. SPIR-V has + multiple constant instructions covering different scalar types: + + * `OpSpecConstantTrue` and `OpSpecConstantFalse` for boolean constants + * `OpSpecConstant` for scalar constants + + Similar as `spv.constant`, this op represents all of the above cases. + `OpSpecConstantComposite` and `OpSpecConstantOp` are modelled with + separate ops. + + ### Custom assembly form + + ``` + spv-spec-constant-op ::= `spv.specConstant` symbol-ref-id + `spec_id(` integer `)` + `=` attribute-value (`:` spirv-type)? + ``` + + where `spec_id` specifies the SPIR-V SpecId decoration associated with + the op. 
+ + For example: + + ``` + spv.specConstant @spec_const1 = true + spv.specConstant @spec_const2 spec_id(5) = 42 : i32 + ``` + + TODO(antiagainst): support composite spec constants with another op + }]; + + let arguments = (ins + StrAttr:$sym_name, + AnyAttr:$default_value + ); + + let results = (outs); + + let hasOpcode = 0; + + let autogenSerialization = 0; +} + +#endif // SPIRV_STRUCTURE_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..001d3130778402c39dae08ef9d9d573482f81762 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h @@ -0,0 +1,197 @@ +//===- SPIRVTypes.h - MLIR SPIR-V Types -------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the types in the SPIR-V dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SPIRVTYPES_H_ +#define MLIR_DIALECT_SPIRV_SPIRVTYPES_H_ + +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeSupport.h" +#include "mlir/IR/Types.h" + +// Pull in all enum type definitions and utility function declarations +#include "mlir/Dialect/SPIRV/SPIRVEnums.h.inc" + +#include + +namespace mlir { +namespace spirv { + +namespace detail { +struct ArrayTypeStorage; +struct ImageTypeStorage; +struct PointerTypeStorage; +struct RuntimeArrayTypeStorage; +struct StructTypeStorage; +} // namespace detail + +namespace TypeKind { +enum Kind { + Array = Type::FIRST_SPIRV_TYPE, + Image, + Pointer, + RuntimeArray, + Struct, + LAST_SPIRV_TYPE = Struct, +}; +} + +// SPIR-V composite type: VectorType, SPIR-V ArrayType, or SPIR-V StructType. 
+class CompositeType : public Type { +public: + using Type::Type; + + static bool classof(Type type); + + unsigned getNumElements() const; + + Type getElementType(unsigned) const; +}; + +// SPIR-V array type +class ArrayType : public Type::TypeBase { +public: + using Base::Base; + // Zero layout specifies that is no layout + using LayoutInfo = uint64_t; + + static bool kindof(unsigned kind) { return kind == TypeKind::Array; } + + static ArrayType get(Type elementType, unsigned elementCount); + + static ArrayType get(Type elementType, unsigned elementCount, + LayoutInfo layoutInfo); + + unsigned getNumElements() const; + + Type getElementType() const; + + bool hasLayout() const; + + uint64_t getArrayStride() const; +}; + +// SPIR-V image type +class ImageType + : public Type::TypeBase { +public: + using Base::Base; + + static bool kindof(unsigned kind) { return kind == TypeKind::Image; } + + static ImageType + get(Type elementType, Dim dim, + ImageDepthInfo depth = ImageDepthInfo::DepthUnknown, + ImageArrayedInfo arrayed = ImageArrayedInfo::NonArrayed, + ImageSamplingInfo samplingInfo = ImageSamplingInfo::SingleSampled, + ImageSamplerUseInfo samplerUse = ImageSamplerUseInfo::SamplerUnknown, + ImageFormat format = ImageFormat::Unknown) { + return ImageType::get( + std::tuple( + elementType, dim, depth, arrayed, samplingInfo, samplerUse, + format)); + } + + static ImageType + get(std::tuple); + + Type getElementType() const; + Dim getDim() const; + ImageDepthInfo getDepthInfo() const; + ImageArrayedInfo getArrayedInfo() const; + ImageSamplingInfo getSamplingInfo() const; + ImageSamplerUseInfo getSamplerUseInfo() const; + ImageFormat getImageFormat() const; + // TODO(ravishankarm): Add support for Access qualifier +}; + +// SPIR-V pointer type +class PointerType + : public Type::TypeBase { +public: + using Base::Base; + + static bool kindof(unsigned kind) { return kind == TypeKind::Pointer; } + + static PointerType get(Type pointeeType, StorageClass storageClass); + + 
Type getPointeeType() const; + + StorageClass getStorageClass() const; +}; + +// SPIR-V run-time array type +class RuntimeArrayType + : public Type::TypeBase { +public: + using Base::Base; + + static bool kindof(unsigned kind) { return kind == TypeKind::RuntimeArray; } + + static RuntimeArrayType get(Type elementType); + + Type getElementType() const; +}; + +// SPIR-V struct type +class StructType : public Type::TypeBase { +public: + using Base::Base; + + // Layout information used for members in a struct in SPIR-V + // + // TODO(ravishankarm) : For now this only supports the offset type, so uses + // uint64_t value to represent the offset, with + // std::numeric_limit::max indicating no offset. Change this to + // something that can hold all the information needed for different member + // types + using LayoutInfo = uint64_t; + + using MemberDecorationInfo = std::pair; + + static bool kindof(unsigned kind) { return kind == TypeKind::Struct; } + + /// Construct a StructType with at least one member. + static StructType get(ArrayRef memberTypes, + ArrayRef layoutInfo = {}, + ArrayRef memberDecorations = {}); + + /// Construct a struct with no members. + static StructType getEmpty(MLIRContext *context); + + unsigned getNumElements() const; + + Type getElementType(unsigned) const; + + bool hasLayout() const; + + uint64_t getOffset(unsigned) const; + + // Returns in `allMemberDecorations` the spirv::Decorations (apart from + // Offset) associated with all members of the StructType. + void getMemberDecorations(SmallVectorImpl + &allMemberDecorations) const; + + // Returns in `memberDecorations` all the spirv::Decorations (apart from + // Offset) associated with the `i`-th member of the StructType. 
+ void getMemberDecorations( + unsigned i, SmallVectorImpl &memberDecorations) const; +}; + +} // end namespace spirv +} // end namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SPIRVTYPES_H_ diff --git a/mlir/include/mlir/Dialect/SPIRV/Serialization.h b/mlir/include/mlir/Dialect/SPIRV/Serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..e8240b0072e822573e17b8b4184d9f4a6cfe120d --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/Serialization.h @@ -0,0 +1,40 @@ +//===- Serialization.h - MLIR SPIR-V (De)serialization ----------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the entry points for serialize and deserialize SPIR-V +// binary modules. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_SERIALIZATION_H_ +#define MLIR_DIALECT_SPIRV_SERIALIZATION_H_ + +#include "mlir/Support/LLVM.h" + +namespace mlir { +struct LogicalResult; +class MLIRContext; + +namespace spirv { +class ModuleOp; + +/// Serializes the given SPIR-V `module` and writes to `binary`. On failure, +/// reports errors to the error handler registered with the MLIR context for +/// `module`. +LogicalResult serialize(ModuleOp module, SmallVectorImpl &binary); + +/// Deserializes the given SPIR-V `binary` module and creates a MLIR ModuleOp +/// in the given `context`. Returns the ModuleOp on success; otherwise, reports +/// errors to the error handler registered with `context` and returns +/// llvm::None. 
+Optional deserialize(ArrayRef binary, MLIRContext *context); + +} // end namespace spirv +} // end namespace mlir + +#endif // MLIR_DIALECT_SPIRV_SERIALIZATION_H_ diff --git a/mlir/include/mlir/Dialect/StandardOps/CMakeLists.txt b/mlir/include/mlir/Dialect/StandardOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6534797a065cbcc148416b41ee1aaecdeb89b36 --- /dev/null +++ b/mlir/include/mlir/Dialect/StandardOps/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) +mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRStandardOpsIncGen) diff --git a/mlir/include/mlir/Dialect/StandardOps/Ops.h b/mlir/include/mlir/Dialect/StandardOps/Ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0ba16c56f8eb6ad012d2af2f8725f0ff1d9db02f --- /dev/null +++ b/mlir/include/mlir/Dialect/StandardOps/Ops.h @@ -0,0 +1,342 @@ +//===- Ops.h - Standard MLIR Operations -------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines convenience types for working with standard operations +// in the MLIR operation set. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_STANDARDOPS_OPS_H +#define MLIR_DIALECT_STANDARDOPS_OPS_H + +#include "mlir/Analysis/CallInterfaces.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" + +// Pull in all enum type definitions and utility function declarations. 
+#include "mlir/Dialect/StandardOps/OpsEnums.h.inc" + +namespace mlir { +class AffineMap; +class Builder; +class FuncOp; +class OpBuilder; + +class StandardOpsDialect : public Dialect { +public: + StandardOpsDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "std"; } + + /// Materialize a single constant operation from a given attribute value with + /// the desired resultant type. + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; +}; + +/// The predicate indicates the type of the comparison to perform: +/// (un)orderedness, (in)equality and less/greater than (or equal to) as +/// well as predicates that are always true or false. +enum class CmpFPredicate { + FirstValidValue, + // Always false + AlwaysFalse = FirstValidValue, + // Ordered comparisons + OEQ, + OGT, + OGE, + OLT, + OLE, + ONE, + // Both ordered + ORD, + // Unordered comparisons + UEQ, + UGT, + UGE, + ULT, + ULE, + UNE, + // Any unordered + UNO, + // Always true + AlwaysTrue, + // Number of predicates. + NumPredicates +}; + +#define GET_OP_CLASSES +#include "mlir/Dialect/StandardOps/Ops.h.inc" + +/// This is a refinement of the "constant" op for the case where it is +/// returning a float value of FloatType. +/// +/// %1 = "std.constant"(){value: 42.0} : bf16 +/// +class ConstantFloatOp : public ConstantOp { +public: + using ConstantOp::ConstantOp; + + /// Builds a constant float op producing a float of the specified type. + static void build(Builder *builder, OperationState &result, + const APFloat &value, FloatType type); + + APFloat getValue() { return getAttrOfType("value").getValue(); } + + static bool classof(Operation *op); +}; + +/// This is a refinement of the "constant" op for the case where it is +/// returning an integer value of IntegerType. 
+/// +/// %1 = "std.constant"(){value: 42} : i32 +/// +class ConstantIntOp : public ConstantOp { +public: + using ConstantOp::ConstantOp; + /// Build a constant int op producing an integer of the specified width. + static void build(Builder *builder, OperationState &result, int64_t value, + unsigned width); + + /// Build a constant int op producing an integer with the specified type, + /// which must be an integer type. + static void build(Builder *builder, OperationState &result, int64_t value, + Type type); + + int64_t getValue() { return getAttrOfType("value").getInt(); } + + static bool classof(Operation *op); +}; + +/// This is a refinement of the "constant" op for the case where it is +/// returning an integer value of Index type. +/// +/// %1 = "std.constant"(){value: 99} : () -> index +/// +class ConstantIndexOp : public ConstantOp { +public: + using ConstantOp::ConstantOp; + + /// Build a constant int op producing an index. + static void build(Builder *builder, OperationState &result, int64_t value); + + int64_t getValue() { return getAttrOfType("value").getInt(); } + + static bool classof(Operation *op); +}; + +// DmaStartOp starts a non-blocking DMA operation that transfers data from a +// source memref to a destination memref. The source and destination memref need +// not be of the same dimensionality, but need to have the same elemental type. +// The operands include the source and destination memref's each followed by its +// indices, size of the data transfer in terms of the number of elements (of the +// elemental type of the memref), a tag memref with its indices, and optionally +// at the end, a stride and a number_of_elements_per_stride arguments. The tag +// location is used by a DmaWaitOp to check for completion. The indices of the +// source memref, destination memref, and the tag memref have the same +// restrictions as any load/store. 
The optional stride arguments should be of +// 'index' type, and specify a stride for the slower memory space (memory space +// with a lower memory space id), transferring chunks of +// number_of_elements_per_stride every stride until %num_elements are +// transferred. Either both or no stride arguments should be specified. +// +// For example, a DmaStartOp operation that transfers 256 elements of a memref +// '%src' in memory space 0 at indices [%i, %j] to memref '%dst' in memory space +// 1 at indices [%k, %l], would be specified as follows: +// +// %num_elements = constant 256 +// %idx = constant 0 : index +// %tag = alloc() : memref<1 x i32, (d0) -> (d0), 4> +// dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] : +// memref<40 x 128 x f32>, (d0) -> (d0), 0>, +// memref<2 x 1024 x f32>, (d0) -> (d0), 1>, +// memref<1 x i32>, (d0) -> (d0), 2> +// +// If %stride and %num_elt_per_stride are specified, the DMA is expected to +// transfer %num_elt_per_stride elements every %stride elements apart from +// memory space 0 until %num_elements are transferred. +// +// dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride, +// %num_elt_per_stride : +// +// TODO(mlir-team): add additional operands to allow source and destination +// striding, and multiple stride levels. +// TODO(andydavis) Consider replacing src/dst memref indices with view memrefs. +class DmaStartOp + : public Op { +public: + using Op::Op; + + static void build(Builder *builder, OperationState &result, Value srcMemRef, + ValueRange srcIndices, Value destMemRef, + ValueRange destIndices, Value numElements, Value tagMemRef, + ValueRange tagIndices, Value stride = nullptr, + Value elementsPerStride = nullptr); + + // Returns the source MemRefType for this DMA operation. + Value getSrcMemRef() { return getOperand(0); } + // Returns the rank (number of indices) of the source MemRefType. 
+ unsigned getSrcMemRefRank() { + return getSrcMemRef()->getType().cast().getRank(); + } + // Returns the source memref indices for this DMA operation. + operand_range getSrcIndices() { + return {getOperation()->operand_begin() + 1, + getOperation()->operand_begin() + 1 + getSrcMemRefRank()}; + } + + // Returns the destination MemRefType for this DMA operations. + Value getDstMemRef() { return getOperand(1 + getSrcMemRefRank()); } + // Returns the rank (number of indices) of the destination MemRefType. + unsigned getDstMemRefRank() { + return getDstMemRef()->getType().cast().getRank(); + } + unsigned getSrcMemorySpace() { + return getSrcMemRef()->getType().cast().getMemorySpace(); + } + unsigned getDstMemorySpace() { + return getDstMemRef()->getType().cast().getMemorySpace(); + } + + // Returns the destination memref indices for this DMA operation. + operand_range getDstIndices() { + return {getOperation()->operand_begin() + 1 + getSrcMemRefRank() + 1, + getOperation()->operand_begin() + 1 + getSrcMemRefRank() + 1 + + getDstMemRefRank()}; + } + + // Returns the number of elements being transferred by this DMA operation. + Value getNumElements() { + return getOperand(1 + getSrcMemRefRank() + 1 + getDstMemRefRank()); + } + + // Returns the Tag MemRef for this DMA operation. + Value getTagMemRef() { + return getOperand(1 + getSrcMemRefRank() + 1 + getDstMemRefRank() + 1); + } + // Returns the rank (number of indices) of the tag MemRefType. + unsigned getTagMemRefRank() { + return getTagMemRef()->getType().cast().getRank(); + } + + // Returns the tag memref index for this DMA operation. + operand_range getTagIndices() { + unsigned tagIndexStartPos = + 1 + getSrcMemRefRank() + 1 + getDstMemRefRank() + 1 + 1; + return {getOperation()->operand_begin() + tagIndexStartPos, + getOperation()->operand_begin() + tagIndexStartPos + + getTagMemRefRank()}; + } + + /// Returns true if this is a DMA from a faster memory space to a slower one. 
+ bool isDestMemorySpaceFaster() { + return (getSrcMemorySpace() < getDstMemorySpace()); + } + + /// Returns true if this is a DMA from a slower memory space to a faster one. + bool isSrcMemorySpaceFaster() { + // Assumes that a lower number is for a slower memory space. + return (getDstMemorySpace() < getSrcMemorySpace()); + } + + /// Given a DMA start operation, returns the operand position of either the + /// source or destination memref depending on the one that is at the higher + /// level of the memory hierarchy. Asserts failure if neither is true. + unsigned getFasterMemPos() { + assert(isSrcMemorySpaceFaster() || isDestMemorySpaceFaster()); + return isSrcMemorySpaceFaster() ? 0 : getSrcMemRefRank() + 1; + } + + static StringRef getOperationName() { return "std.dma_start"; } + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + + LogicalResult fold(ArrayRef cstOperands, + SmallVectorImpl &results); + + bool isStrided() { + return getNumOperands() != 1 + getSrcMemRefRank() + 1 + getDstMemRefRank() + + 1 + 1 + getTagMemRefRank(); + } + + Value getStride() { + if (!isStrided()) + return nullptr; + return getOperand(getNumOperands() - 1 - 1); + } + + Value getNumElementsPerStride() { + if (!isStrided()) + return nullptr; + return getOperand(getNumOperands() - 1); + } +}; + +// DmaWaitOp blocks until the completion of a DMA operation associated with the +// tag element '%tag[%index]'. %tag is a memref, and %index has to be an index +// with the same restrictions as any load/store index. %num_elements is the +// number of elements associated with the DMA operation. For example: +// +// dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] : +// memref<2048 x f32>, (d0) -> (d0), 0>, +// memref<256 x f32>, (d0) -> (d0), 1> +// memref<1 x i32>, (d0) -> (d0), 2> +// ... +// ... 
+// dma_wait %tag[%index], %num_elements : memref<1 x i32, (d0) -> (d0), 2> +// +class DmaWaitOp + : public Op { +public: + using Op::Op; + + static void build(Builder *builder, OperationState &result, Value tagMemRef, + ValueRange tagIndices, Value numElements); + + static StringRef getOperationName() { return "std.dma_wait"; } + + // Returns the Tag MemRef associated with the DMA operation being waited on. + Value getTagMemRef() { return getOperand(0); } + + // Returns the tag memref index for this DMA operation. + operand_range getTagIndices() { + return {getOperation()->operand_begin() + 1, + getOperation()->operand_begin() + 1 + getTagMemRefRank()}; + } + + // Returns the rank (number of indices) of the tag memref. + unsigned getTagMemRefRank() { + return getTagMemRef()->getType().cast().getRank(); + } + + // Returns the number of elements transferred in the associated DMA operation. + Value getNumElements() { return getOperand(1 + getTagMemRefRank()); } + + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult fold(ArrayRef cstOperands, + SmallVectorImpl &results); +}; + +/// Prints dimension and symbol list. +void printDimAndSymbolList(Operation::operand_iterator begin, + Operation::operand_iterator end, unsigned numDims, + OpAsmPrinter &p); + +/// Parses dimension and symbol list and returns true if parsing failed. 
+ParseResult parseDimAndSymbolList(OpAsmParser &parser, + SmallVectorImpl &operands, + unsigned &numDims); + +raw_ostream &operator<<(raw_ostream &os, SubViewOp::Range &range); + +} // end namespace mlir + +#endif // MLIR_DIALECT_STANDARDOPS_OPS_H diff --git a/mlir/include/mlir/Dialect/StandardOps/Ops.td b/mlir/include/mlir/Dialect/StandardOps/Ops.td new file mode 100644 index 0000000000000000000000000000000000000000..1c8bb251c0298740cbfe58a40a83a3d602202cea --- /dev/null +++ b/mlir/include/mlir/Dialect/StandardOps/Ops.td @@ -0,0 +1,1626 @@ +//===- Ops.td - Standard operation definitions -------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines some MLIR standard operations. +// +//===----------------------------------------------------------------------===// + +#ifndef STANDARD_OPS +#define STANDARD_OPS + +include "mlir/Analysis/CallInterfaces.td" +include "mlir/IR/OpAsmInterface.td" + +def Std_Dialect : Dialect { + let name = "std"; + let cppNamespace = ""; +} + +// Base class for Standard dialect ops. +class Std_Op traits = []> : + Op { + // For every standard op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions. + let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +// Base class for standard cast operations. Requires single operand and result, +// but does not constrain them to specific types. 
+class CastOp traits = []> : + Std_Op { + + let results = (outs AnyType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, Type destType", [{ + impl::buildCastOp(builder, result, source, destType); + }]>]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + let printer = [{ + return printStandardCastOp(this->getOperation(), p); + }]; + let verifier = [{ return ::verifyCastOp(*this); }]; + + let hasFolder = 1; +} + +// Base class for unary ops. Requires single operand and result. Individual +// classes will have `operand` accessor. +class UnaryOp traits = []> : + Op { + let results = (outs AnyType); + let printer = [{ + return printStandardUnaryOp(this->getOperation(), p); + }]; +} + +class UnaryOpSameOperandAndResultType traits = []> : + UnaryOp { + let parser = [{ + return impl::parseOneResultSameOperandTypeOp(parser, result); + }]; +} + +class FloatUnaryOp traits = []> : + UnaryOpSameOperandAndResultType, + Arguments<(ins FloatLike:$operand)>; + +// Base class for standard arithmetic operations. Requires operands and +// results to be of the same type, but does not constrain them to specific +// types. Individual classes will have `lhs` and `rhs` accessor to operands. +class ArithmeticOp traits = []> : + Op { + + let results = (outs AnyType); + + let parser = [{ + return impl::parseOneResultSameOperandTypeOp(parser, result); + }]; + + let printer = [{ + return printStandardBinaryOp(this->getOperation(), p); + }]; +} + +// Base class for standard arithmetic operations on integers, vectors and +// tensors thereof. This operation takes two operands and returns one result, +// each of these is required to be of the same type. This type may be an +// integer scalar type, a vector whose element type is an integer type, or an +// integer tensor. 
The custom assembly form of the operation is as follows +// +// i %0, %1 : i32 +class IntArithmeticOp traits = []> : + ArithmeticOp, + Arguments<(ins IntegerLike:$lhs, IntegerLike:$rhs)>; + +// Base class for standard arithmetic binary operations on floats, vectors and +// tensors thereof. This operation has two operands and returns one result, +// each of these is required to be of the same type. This type may be a +// floating point scalar type, a vector whose element type is a floating point +// type, or a floating point tensor. The custom assembly form of the operation +// is as follows +// +// f %0, %1 : f32 +class FloatArithmeticOp traits = []> : + ArithmeticOp, + Arguments<(ins FloatLike:$lhs, FloatLike:$rhs)>; + +def AbsFOp : FloatUnaryOp<"absf"> { + let summary = "floating point absolute-value operation"; + let description = [{ + The `absf` operation computes the absolute value. It takes one operand and + returns one result of the same type. This type may be a float scalar type, + a vector whose element type is float, or a tensor of floats. It has no + standard attributes. + }]; +} + +def AddFOp : FloatArithmeticOp<"addf"> { + let summary = "floating point addition operation"; + let hasFolder = 1; +} + +def AddIOp : IntArithmeticOp<"addi", [Commutative]> { + let summary = "integer addition operation"; + let hasFolder = 1; +} + +def AllocOp : Std_Op<"alloc"> { + let summary = "memory allocation operation"; + let description = [{ + The "alloc" operation allocates a region of memory, as specified by its + memref type. For example: + + %0 = alloc() : memref<8x64xf32, (d0, d1) -> (d0, d1), 1> + + The optional list of dimension operands are bound to the dynamic dimensions + specified in its memref type. In the example below, the ssa value '%d' is + bound to the second dimension of the memref (which is dynamic). 
+ + %0 = alloc(%d) : memref<8x?xf32, (d0, d1) -> (d0, d1), 1> + + The optional list of symbol operands are bound to the symbols of the + memrefs affine map. In the example below, the ssa value '%s' is bound to + the symbol 's0' in the affine map specified in the allocs memref type. + + %0 = alloc()[%s] : memref<8x64xf32, (d0, d1)[s0] -> ((d0 + s0), d1), 1> + + This operation returns a single ssa value of memref type, which can be used + by subsequent load and store operations. + + The optional `alignment` attribute may be specified to ensure that the + region of memory that will be indexed is aligned at the specified byte + boundary. TODO(b/144281289) optional alignment attribute to MemRefType. + + %0 = alloc()[%s] {alignment = 8} : + memref<8x64xf32, (d0, d1)[s0] -> ((d0 + s0), d1), 1> + }]; + + let arguments = (ins Variadic:$value, + Confined, [IntMinValue<0>]>:$alignment); + let results = (outs AnyMemRef); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, MemRefType memrefType", [{ + result.types.push_back(memrefType); + }]>, + OpBuilder< + "Builder *builder, OperationState &result, MemRefType memrefType, " # + "ArrayRef operands, IntegerAttr alignment = IntegerAttr()", [{ + result.addOperands(operands); + result.types.push_back(memrefType); + if (alignment) + result.addAttribute(getAlignmentAttrName(), alignment); + }]>]; + + let extraClassDeclaration = [{ + static StringRef getAlignmentAttrName() { return "alignment"; } + + MemRefType getType() { return getResult()->getType().cast(); } + + /// Returns the number of symbolic operands (the ones in square brackets), + /// which bind to the symbols of the memref's layout map. + unsigned getNumSymbolicOperands() { + return getNumOperands() - getType().getNumDynamicDims(); + } + + /// Returns the symbolic operands (the ones in square brackets), which bind + /// to the symbols of the memref's layout map. 
+ operand_range getSymbolicOperands() { + return {operand_begin() + getType().getNumDynamicDims(), operand_end()}; + } + + /// Returns the dynamic sizes for this alloc operation if specified. + operand_range getDynamicSizes() { return getOperands(); } + }]; + + let hasCanonicalizer = 1; +} + +def AndOp : IntArithmeticOp<"and", [Commutative]> { + let summary = "integer binary and"; + let hasFolder = 1; +} + +def BranchOp : Std_Op<"br", [Terminator]> { + let summary = "branch operation"; + let description = [{ + The "br" operation represents a branch operation in a function. + The operation takes variable number of operands and produces no results. + The operand number and types for each successor must match the arguments of + the block successor. For example: + + ^bb2: + %2 = call @someFn() + br ^bb3(%2 : tensor<*xf32>) + ^bb3(%3: tensor<*xf32>): + }]; + + let arguments = (ins Variadic:$operands); + + let builders = [OpBuilder< + "Builder *, OperationState &result, Block *dest," + "ValueRange operands = {}", [{ + result.addSuccessor(dest, operands); + }]>]; + + // BranchOp is fully verified by traits. + let verifier = ?; + + let extraClassDeclaration = [{ + Block *getDest(); + void setDest(Block *block); + + /// Erase the operand at 'index' from the operand list. + void eraseOperand(unsigned index); + }]; + + let hasCanonicalizer = 1; +} + +def CallOp : Std_Op<"call", [CallOpInterface]> { + let summary = "call operation"; + let description = [{ + The "call" operation represents a direct call to a function that is within + the same symbol scope as the call. The operands and result types of the + call must match the specified function type. The callee is encoded as a + function attribute named "callee". 
+ + %2 = call @my_add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, FuncOp callee," + "ValueRange operands = {}", [{ + result.addOperands(operands); + result.addAttribute("callee", builder->getSymbolRefAttr(callee)); + result.addTypes(callee.getType().getResults()); + }]>, OpBuilder< + "Builder *builder, OperationState &result, SymbolRefAttr callee," + "ArrayRef results, ValueRange operands = {}", [{ + result.addOperands(operands); + result.addAttribute("callee", callee); + result.addTypes(results); + }]>, OpBuilder< + "Builder *builder, OperationState &result, StringRef callee," + "ArrayRef results, ValueRange operands = {}", [{ + build(builder, result, builder->getSymbolRefAttr(callee), results, + operands); + }]>]; + + let extraClassDeclaration = [{ + StringRef getCallee() { return callee(); } + FunctionType getCalleeType(); + + /// Get the argument operands to the called function. + operand_range getArgOperands() { + return {arg_operand_begin(), arg_operand_end()}; + } + + operand_iterator arg_operand_begin() { return operand_begin(); } + operand_iterator arg_operand_end() { return operand_end(); } + + /// Return the callee of this operation. + CallInterfaceCallable getCallableForCallee() { + return getAttrOfType("callee"); + } + }]; +} + +def CallIndirectOp : Std_Op<"call_indirect", [CallOpInterface]> { + let summary = "indirect call operation"; + let description = [{ + The "call_indirect" operation represents an indirect call to a value of + function type. Functions are first class types in MLIR, and may be passed + as arguments and merged together with block arguments. The operands + and result types of the call must match the specified function type. 
+ + %3 = call_indirect %2(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FunctionType:$callee, Variadic:$operands); + let results = (outs Variadic); + + let builders = [OpBuilder< + "Builder *, OperationState &result, Value callee," + "ValueRange operands = {}", [{ + result.operands.push_back(callee); + result.addOperands(operands); + result.addTypes(callee->getType().cast().getResults()); + }]>]; + + let extraClassDeclaration = [{ + Value getCallee() { return getOperand(0); } + + /// Get the argument operands to the called function. + operand_range getArgOperands() { + return {arg_operand_begin(), arg_operand_end()}; + } + + operand_iterator arg_operand_begin() { return ++operand_begin(); } + operand_iterator arg_operand_end() { return operand_end(); } + + /// Return the callee of this operation. + CallInterfaceCallable getCallableForCallee() { return getCallee(); } + }]; + + let hasCanonicalizer = 1; +} + +def CeilFOp : FloatUnaryOp<"ceilf"> { + let summary = "ceiling of the specified value"; + let description = [{ + The `ceilf` operation computes the ceiling of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. + }]; +} + +def CmpFOp : Std_Op<"cmpf", + [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]> { + let summary = "floating-point comparison operation"; + let description = [{ + The "cmpf" operation compares its two operands according to the float + comparison rules and the predicate specified by the respective attribute. + The predicate defines the type of comparison: (un)orderedness, (in)equality + and signed less/greater than (or equal to) as well as predicates that are + always true or false. The operands must have the same type, and this type + must be a float type, or a vector or tensor thereof. 
The result is an i1, + or a vector/tensor thereof having the same shape as the inputs. Unlike cmpi, + the operands are always treated as signed. The u prefix indicates + *unordered* comparison, not unsigned comparison, so "une" means unordered or + not equal. For the sake of readability by humans, custom assembly form for + the operation uses a string-typed attribute for the predicate. The value of + this attribute corresponds to lower-cased name of the predicate constant, + e.g., "one" means "ordered not equal". The string representation of the + attribute is merely a syntactic sugar and is converted to an integer + attribute by the parser. + + %r1 = cmpf "oeq" %0, %1 : f32 + %r2 = cmpf "ult" %0, %1 : tensor<42x42xf64> + %r3 = "std.cmpf"(%0, %1) {predicate: 0} : (f8, f8) -> i1 + }]; + + let arguments = (ins FloatLike:$lhs, FloatLike:$rhs); + let results = (outs BoolLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, CmpFPredicate predicate," + "Value lhs, Value rhs", [{ + ::buildCmpFOp(builder, result, predicate, lhs, rhs); + }]>]; + + let extraClassDeclaration = [{ + static StringRef getPredicateAttrName() { return "predicate"; } + static CmpFPredicate getPredicateByName(StringRef name); + + CmpFPredicate getPredicate() { + return (CmpFPredicate)getAttrOfType(getPredicateAttrName()) + .getInt(); + } + }]; + + let hasFolder = 1; +} + +def CMPI_P_EQ : I64EnumAttrCase<"eq", 0>; +def CMPI_P_NE : I64EnumAttrCase<"ne", 1>; +def CMPI_P_SLT : I64EnumAttrCase<"slt", 2>; +def CMPI_P_SLE : I64EnumAttrCase<"sle", 3>; +def CMPI_P_SGT : I64EnumAttrCase<"sgt", 4>; +def CMPI_P_SGE : I64EnumAttrCase<"sge", 5>; +def CMPI_P_ULT : I64EnumAttrCase<"ult", 6>; +def CMPI_P_ULE : I64EnumAttrCase<"ule", 7>; +def CMPI_P_UGT : I64EnumAttrCase<"ugt", 8>; +def CMPI_P_UGE : I64EnumAttrCase<"uge", 9>; + +def CmpIPredicateAttr : I64EnumAttr< + "CmpIPredicate", "", + [CMPI_P_EQ, CMPI_P_NE, CMPI_P_SLT, CMPI_P_SLE, CMPI_P_SGT, + CMPI_P_SGE, CMPI_P_ULT, CMPI_P_ULE, 
CMPI_P_UGT, CMPI_P_UGE]> { + let cppNamespace = "::mlir"; +} + +def CmpIOp : Std_Op<"cmpi", + [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]> { + let summary = "integer comparison operation"; + let description = [{ + The "cmpi" operation compares its two operands according to the integer + comparison rules and the predicate specified by the respective attribute. + The predicate defines the type of comparison: (in)equality, (un)signed + less/greater than (or equal to). The operands must have the same type, and + this type must be an integer type, a vector or a tensor thereof. The result + is an i1, or a vector/tensor thereof having the same shape as the inputs. + Since integers are signless, the predicate also explicitly indicates + whether to interpret the operands as signed or unsigned integers for + less/greater than comparisons. For the sake of readability by humans, + custom assembly form for the operation uses a string-typed attribute for + the predicate. The value of this attribute corresponds to lower-cased name + of the predicate constant, e.g., "slt" means "signed less than". The string + representation of the attribute is merely a syntactic sugar and is converted + to an integer attribute by the parser. 
+ + %r1 = cmpi "eq" %0, %1 : i32 + %r2 = cmpi "slt" %0, %1 : tensor<42x42xi64> + %r3 = "std.cmpi"(%0, %1){predicate: 0} : (i8, i8) -> i1 + }]; + + let arguments = (ins + CmpIPredicateAttr:$predicate, + IntegerLike:$lhs, + IntegerLike:$rhs + ); + let results = (outs BoolLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, CmpIPredicate predicate," + "Value lhs, Value rhs", [{ + ::buildCmpIOp(builder, result, predicate, lhs, rhs); + }]>]; + + let extraClassDeclaration = [{ + static StringRef getPredicateAttrName() { return "predicate"; } + static CmpIPredicate getPredicateByName(StringRef name); + + CmpIPredicate getPredicate() { + return (CmpIPredicate)getAttrOfType(getPredicateAttrName()) + .getInt(); + } + }]; + + let verifier = [{ return success(); }]; + + let hasFolder = 1; +} + +def CondBranchOp : Std_Op<"cond_br", [Terminator]> { + let summary = "conditional branch operation"; + let description = [{ + The "cond_br" operation represents a conditional branch operation in a + function. The operation takes variable number of operands and produces + no results. The operand number and types for each successor must match the + arguments of the block successor. For example: + + ^bb0: + %0 = extract_element %arg0[] : tensor + cond_br %0, ^bb1, ^bb2 + ^bb1: + ... + ^bb2: + ... + }]; + + let arguments = (ins I1:$condition, Variadic:$branchOperands); + + let builders = [OpBuilder< + "Builder *, OperationState &result, Value condition," + "Block *trueDest, ValueRange trueOperands," + "Block *falseDest, ValueRange falseOperands", [{ + result.addOperands(condition); + result.addSuccessor(trueDest, trueOperands); + result.addSuccessor(falseDest, falseOperands); + }]>]; + + // CondBranchOp is fully verified by traits. + let verifier = ?; + + let extraClassDeclaration = [{ + // These are the indices into the dests list. + enum { trueIndex = 0, falseIndex = 1 }; + + // The condition operand is the first operand in the list. 
+ Value getCondition() { return getOperand(0); } + + /// Return the destination if the condition is true. + Block *getTrueDest() { + return getSuccessor(trueIndex); + } + + /// Return the destination if the condition is false. + Block *getFalseDest() { + return getSuccessor(falseIndex); + } + + // Accessors for operands to the 'true' destination. + Value getTrueOperand(unsigned idx) { + assert(idx < getNumTrueOperands()); + return getOperand(getTrueDestOperandIndex() + idx); + } + + void setTrueOperand(unsigned idx, Value value) { + assert(idx < getNumTrueOperands()); + setOperand(getTrueDestOperandIndex() + idx, value); + } + + operand_iterator true_operand_begin() { + return operand_begin() + getTrueDestOperandIndex(); + } + operand_iterator true_operand_end() { + return true_operand_begin() + getNumTrueOperands(); + } + operand_range getTrueOperands() { + return {true_operand_begin(), true_operand_end()}; + } + + unsigned getNumTrueOperands() { + return getNumSuccessorOperands(trueIndex); + } + + /// Erase the operand at 'index' from the true operand list. + void eraseTrueOperand(unsigned index) { + getOperation()->eraseSuccessorOperand(trueIndex, index); + } + + // Accessors for operands to the 'false' destination. + Value getFalseOperand(unsigned idx) { + assert(idx < getNumFalseOperands()); + return getOperand(getFalseDestOperandIndex() + idx); + } + void setFalseOperand(unsigned idx, Value value) { + assert(idx < getNumFalseOperands()); + setOperand(getFalseDestOperandIndex() + idx, value); + } + + operand_iterator false_operand_begin() { return true_operand_end(); } + operand_iterator false_operand_end() { + return false_operand_begin() + getNumFalseOperands(); + } + operand_range getFalseOperands() { + return {false_operand_begin(), false_operand_end()}; + } + + unsigned getNumFalseOperands() { + return getNumSuccessorOperands(falseIndex); + } + + /// Erase the operand at 'index' from the false operand list. 
+ void eraseFalseOperand(unsigned index) { + getOperation()->eraseSuccessorOperand(falseIndex, index); + } + + private: + /// Get the index of the first true destination operand. + unsigned getTrueDestOperandIndex() { return 1; } + + /// Get the index of the first false destination operand. + unsigned getFalseDestOperandIndex() { + return getTrueDestOperandIndex() + getNumTrueOperands(); + } + }]; + + let hasCanonicalizer = 1; +} + +def ConstantOp : Std_Op<"constant", + [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "constant"; + + let arguments = (ins AnyAttr:$value); + let results = (outs AnyType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Attribute value", + [{ build(builder, result, value.getType(), value); }]>]; + + let extraClassDeclaration = [{ + Attribute getValue() { return getAttr("value"); } + + /// Returns true if a constant operation can be built with the given value + /// and result type. + static bool isBuildableWith(Attribute value, Type type); + }]; + + let hasFolder = 1; +} + +def CopySignOp : FloatArithmeticOp<"copysign"> { + let summary = "A copysign operation"; + let description = [{ + The `copysign` returns a value with the magnitude of the first operand and + the sign of the second operand. It takes two operands and returns one + result of the same type. This type may be a float scalar type, a vector + whose element type is float, or a tensor of floats. It has no standard + attributes. + }]; +} + +def CosOp : FloatUnaryOp<"cos"> { + let summary = "cosine of the specified value"; + let description = [{ + The `cos` operation computes the cosine of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. 
+ }]; +} + +def DeallocOp : Std_Op<"dealloc"> { + let summary = "memory deallocation operation"; + let description = [{ + The "dealloc" operation frees the region of memory referenced by a memref + which was originally created by the "alloc" operation. + The "dealloc" operation should not be called on memrefs which alias an + alloc'd memref (i.e. memrefs returned by the "view" and "reshape" + operations). + + %0 = alloc() : memref<8x64xf32, (d0, d1) -> (d0, d1), 1> + dealloc %0 : memref<8x64xf32, (d0, d1) -> (d0, d1), 1> + }]; + + let arguments = (ins AnyMemRef:$memref); + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def DimOp : Std_Op<"dim", [NoSideEffect]> { + let summary = "dimension index operation"; + let description = [{ + The "dim" operation takes a memref or tensor operand and returns an "index". + It requires a single integer attribute named "index". It returns the size + of the specified dimension. For example: + + %1 = dim %0, 2 : tensor + }]; + + let arguments = (ins AnyTypeOf<[AnyMemRef, AnyTensor], + "any tensor or memref type">:$memrefOrTensor, + APIntAttr:$index); + let results = (outs Index); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value memrefOrTensor," + "unsigned index", [{ + auto indexType = builder->getIndexType(); + auto indexAttr = builder->getIntegerAttr(indexType, index); + build(builder, result, indexType, memrefOrTensor, indexAttr); + }]>]; + + let extraClassDeclaration = [{ + unsigned getIndex() { + return getAttrOfType("index").getValue().getZExtValue(); + } + }]; + + let hasFolder = 1; +} + +def DivFOp : FloatArithmeticOp<"divf"> { + let summary = "floating point division operation"; +} + +def SignedDivIOp : IntArithmeticOp<"divi_signed"> { + let summary = "signed integer division operation"; + let hasFolder = 1; +} + +def UnsignedDivIOp : IntArithmeticOp<"divi_unsigned"> { + let summary = "unsigned integer division operation"; + let hasFolder = 1; +} + +def ExpOp : FloatUnaryOp<"exp"> { + 
let summary = "base-e exponential of the specified value"; +} + +def ExtractElementOp : Std_Op<"extract_element", [NoSideEffect]> { + let summary = "element extract operation"; + let description = [{ + The "extract_element" op reads a tensor or vector and returns one element + from it specified by an index list. The output of extract is a new value + with the same type as the elements of the tensor or vector. The arity of + indices matches the rank of the accessed value (i.e., if a tensor is of rank + 3, then 3 indices are required for the extract). The indices should all be + of index type. For example: + + %3 = extract_element %0[%1, %2] : vector<4x4xi32> + }]; + + let arguments = (ins AnyTypeOf<[AnyVector, AnyTensor]>:$aggregate, + Variadic:$indices); + let results = (outs AnyType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value aggregate," + "ValueRange indices = {}", [{ + auto resType = aggregate->getType().cast() + .getElementType(); + build(builder, result, resType, aggregate, indices); + }]>]; + + let extraClassDeclaration = [{ + Value getAggregate() { return getOperand(0); } + + operand_range getIndices() { + return {operand_begin() + 1, operand_end()}; + } + }]; + + let hasFolder = 1; +} + +def IndexCastOp : CastOp<"index_cast">, Arguments<(ins AnyType:$in)> { + let summary = "cast between index and integer types"; + let description = [{ + Casts between integer scalars and 'index' scalars. Index is an integer of + platform-specific bit width. If casting to a wider integer, the value is + sign-extended. If casting to a narrower integer, the value is truncated. + }]; + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. 
+ static bool areCastCompatible(Type a, Type b); + }]; + + let hasFolder = 0; +} + +def FPExtOp : CastOp<"fpext">, Arguments<(ins AnyType:$in)> { + let summary = "cast from floating-point to wider floating-point"; + let description = [{ + Cast a floating-point value to a larger floating-point-typed value. + The destination type must be strictly wider than the source type. + Only scalars are currently supported. + }]; + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + }]; + + let hasFolder = 0; +} + +def FPTruncOp : CastOp<"fptrunc">, Arguments<(ins AnyType:$in)> { + let summary = "cast from floating-point to narrower floating-point"; + let description = [{ + Truncate a floating-point value to a smaller floating-point-typed value. + The destination type must be strictly narrower than the source type. + If the value cannot be exactly represented, it is rounded using the default + rounding mode. Only scalars are currently supported. + }]; + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + }]; + + let hasFolder = 0; +} + +def LoadOp : Std_Op<"load"> { + let summary = "load operation"; + let description = [{ + The "load" op reads an element from a memref specified by an index list. The + output of load is a new value with the same type as the elements of the + memref. The arity of indices is the rank of the memref (i.e., if the memref + loaded from is of rank 3, then 3 indices are required for the load following + the memref identifier). 
For example: + + %3 = load %0[%1, %1] : memref<4x4xi32> + }]; + + let arguments = (ins AnyMemRef:$memref, Variadic:$indices); + let results = (outs AnyType); + + let builders = [OpBuilder< + "Builder *, OperationState &result, Value memref," + "ValueRange indices = {}", [{ + auto memrefType = memref->getType().cast(); + result.addOperands(memref); + result.addOperands(indices); + result.types.push_back(memrefType.getElementType()); + }]>]; + + let extraClassDeclaration = [{ + Value getMemRef() { return getOperand(0); } + void setMemRef(Value value) { setOperand(0, value); } + MemRefType getMemRefType() { + return getMemRef()->getType().cast(); + } + + operand_range getIndices() { return {operand_begin() + 1, operand_end()}; } + }]; + + let hasFolder = 1; +} + +def LogOp : FloatUnaryOp<"log"> { + let summary = "base-e logarithm of the specified value"; +} + +def Log10Op : FloatUnaryOp<"log10"> { + let summary = "base-10 logarithm of the specified value"; +} + +def Log2Op : FloatUnaryOp<"log2"> { + let summary = "base-2 logarithm of the specified value"; +} + +def MemRefCastOp : CastOp<"memref_cast"> { + let summary = "memref cast operation"; + let description = [{ + The "memref_cast" operation converts a memref from one type to an equivalent + type with a compatible shape. The source and destination types are + compatible if: + a. both are ranked memref types with the same element type, affine mappings, + address space, and rank but where the individual dimensions may add or + remove constant dimensions from the memref type. + + If the cast converts any dimensions from an unknown to a known size, then it + acts as an assertion that fails at runtime if the dynamic dimensions + disagree with resultant destination size. + + Example: + Assert that the input dynamic shape matches the destination static shape. + %2 = memref_cast %1 : memref to memref<4x4xf32> + Erase static shape information, replacing it with dynamic information. 
+ %3 = memref_cast %1 : memref<4xf32> to memref + + The same holds true for offsets and strides. + + Assert that the input dynamic shape matches the destination static stride. + %4 = memref_cast %1 : memref<12x4xf32, offset:?, strides: [?, ?]> to + memref<12x4xf32, offset:5, strides: [4, 1]> + Erase static offset and stride information, replacing it with + dynamic information. + %5 = memref_cast %1 : memref<12x4xf32, offset:5, strides: [4, 1]> to + memref<12x4xf32, offset:?, strides: [?, ?]> + + b. either or both memref types are unranked with the same element type, and + address space. + + Example: + Cast to concrete shape. + %4 = memref_cast %1 : memref<*xf32> to memref<4x?xf32> + + Erase rank information. + %5 = memref_cast %1 : memref<4x?xf32> to memref<*xf32> + }]; + + let arguments = (ins AnyRankedOrUnrankedMemRef:$source); + let results = (outs AnyRankedOrUnrankedMemRef); + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + + /// The result of a memref_cast is always a memref. + Type getType() { return getResult()->getType(); } + }]; +} + +def MulFOp : FloatArithmeticOp<"mulf"> { + let summary = "floating point multiplication operation"; + let hasFolder = 1; +} + +def MulIOp : IntArithmeticOp<"muli", [Commutative]> { + let summary = "integer multiplication operation"; + let hasFolder = 1; +} + +def NegFOp : FloatUnaryOp<"negf"> { + let summary = "floating point negation"; + let description = [{ + The `negf` operation computes the negation of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. 
+ }]; +} + +def OrOp : IntArithmeticOp<"or", [Commutative]> { + let summary = "integer binary or"; + let hasFolder = 1; +} + +def PrefetchOp : Std_Op<"prefetch"> { + let summary = "prefetch operation"; + let description = [{ + The "prefetch" op prefetches data from a memref location described with + subscript indices similar to std.load, and with three attributes: a + read/write specifier, a locality hint, and a cache type specifier as shown + below: + + prefetch %0[%i, %j], read, locality<3>, data : memref<400x400xi32> + + The read/write specifier is either 'read' or 'write', the locality hint + ranges from locality<0> (no locality) to locality<3> (extremely local keep + in cache). The cache type specifier is either 'data' or 'instr' + and specifies whether the prefetch is performed on data cache or on + instruction cache. + }]; + + let arguments = (ins AnyMemRef:$memref, Variadic:$indices, + BoolAttr:$isWrite, + Confined, + IntMaxValue<3>]>:$localityHint, + BoolAttr:$isDataCache); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value memref," + "ArrayRef indices, bool isWrite, unsigned hint, bool isData", + [{ + auto hintAttr = builder->getI32IntegerAttr(hint); + auto isWriteAttr = builder->getBoolAttr(isWrite); + auto isDataCacheAttr = builder->getBoolAttr(isData); + result.addOperands(memref); + result.addOperands(indices); + result.addAttribute("localityHint", hintAttr); + result.addAttribute("isWrite", isWriteAttr); + result.addAttribute("isDataCache", isDataCacheAttr); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref()->getType().cast(); + } + static StringRef getLocalityHintAttrName() { return "localityHint"; } + static StringRef getIsWriteAttrName() { return "isWrite"; } + static StringRef getIsDataCacheAttrName() { return "isDataCache"; } + }]; + + let hasFolder = 1; +} + +def RankOp : Std_Op<"rank", [NoSideEffect]> { + let summary = "rank operation"; + let description = [{ + The "rank" 
operation takes a tensor operand and returns its rank. + + %1 = rank %0 : index + }]; + + let arguments = (ins AnyTensor); + let results = (outs Index); + let verifier = ?; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value tensor", [{ + auto indexType = builder->getIndexType(); + build(builder, result, indexType, tensor); + }]>]; + + let hasFolder = 1; +} + +def RemFOp : FloatArithmeticOp<"remf"> { + let summary = "floating point division remainder operation"; +} + +def SignedRemIOp : IntArithmeticOp<"remi_signed"> { + let summary = "signed integer division remainder operation"; + let hasFolder = 1; +} + +def UnsignedRemIOp : IntArithmeticOp<"remi_unsigned"> { + let summary = "unsigned integer division remainder operation"; + let hasFolder = 1; +} + +def ReturnOp : Std_Op<"return", [Terminator, HasParent<"FuncOp">]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes variable number of operands and produces no results. + The operand number and types must match the signature of the function + that contains the operation. For example: + + func @foo() : (i32, f8) { + ... + return %0, %1 : i32, f8 + }]; + + let arguments = (ins Variadic:$operands); + + let builders = [OpBuilder< + "Builder *b, OperationState &result", [{ build(b, result, llvm::None); }] + >]; +} + +def SelectOp : Std_Op<"select", [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "select operation"; + let description = [{ + The "select" operation chooses one value based on a binary condition + supplied as its first operand. If the value of the first operand is 1, the + second operand is chosen, otherwise the third operand is chosen. The second + and the third operand must have the same type. The operation applies + elementwise to vectors and tensors. The shape of all arguments must be + identical. 
For example, the maximum operation is obtained by combining + "select" with "cmpi" as follows. + + %2 = cmpi "gt" %0, %1 : i32 // %2 is i1 + %3 = select %2, %0, %1 : i32 + }]; + + let arguments = (ins BoolLike:$condition, AnyType:$true_value, + AnyType:$false_value); + let results = (outs AnyType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value condition," + "Value trueValue, Value falseValue", [{ + result.addOperands({condition, trueValue, falseValue}); + result.addTypes(trueValue->getType()); + }]>]; + + let extraClassDeclaration = [{ + Value getCondition() { return condition(); } + Value getTrueValue() { return true_value(); } + Value getFalseValue() { return false_value(); } + }]; + + let hasFolder = 1; +} + +def SignExtendIOp : Std_Op<"sexti", + [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "integer sign extension operation"; + let description = [{ + The integer sign extension operation takes an integer input of + width M and an integer destination type of width N. The destination + bit-width must be larger than the input bit-width (N > M). + The top-most (N - M) bits of the output are filled with copies + of the most-significant bit of the input. 
+ + %1 = constant 5 : i3 // %1 is 0b101 + %2 = sexti %1 : i3 to i6 // %2 is 0b111101 + %3 = constant 2 : i3 // %3 is 0b010 + %4 = sexti %3 : i3 to i6 // %4 is 0b000010 + + %5 = sexti %0 : vector<2 x i32> to vector<2 x i64> + }]; + + let arguments = (ins IntegerLike:$value); + let results = (outs IntegerLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value value, Type destType", [{ + result.addOperands(value); + result.addTypes(destType); + }]>]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + let printer = [{ + return printStandardCastOp(this->getOperation(), p); + }]; +} + +def ShiftLeftOp : IntArithmeticOp<"shift_left"> { + let summary = "integer left-shift"; + let description = [{ + The shift_left operation shifts an integer value to the left by a variable + amount. The low order bits are filled with zeros. + + %1 = constant 5 : i8 // %1 is 0b00000101 + %2 = constant 3 : i8 + %3 = shift_left %1, %2 : (i8, i8) -> i8 // %3 is 0b00101000 + }]; +} + +def SignedShiftRightOp : IntArithmeticOp<"shift_right_signed"> { + let summary = "signed integer right-shift"; + let description = [{ + The shift_right_signed operation shifts an integer value to the right by + a variable amount. The integer is interpreted as signed. The high order + bits in the output are filled with copies of the most-significant bit + of the shifted value (which means that the sign of the value is preserved). + + %1 = constant 160 : i8 // %1 is 0b10100000 + %2 = constant 3 : i8 + %3 = shift_right_signed %1, %2 : (i8, i8) -> i8 // %3 is 0b11110100 + %4 = constant 96 : i8 // %4 is 0b01100000 + %5 = shift_right_signed %4, %2 : (i8, i8) -> i8 // %5 is 0b00001100 + }]; +} + +def UnsignedShiftRightOp : IntArithmeticOp<"shift_right_unsigned"> { + let summary = "unsigned integer right-shift"; + let description = [{ + The shift_right_unsigned operation shifts an integer value to the right by + a variable amount. The integer is interpreted as unsigned. 
The high order + bits are always filled with zeros. + + %1 = constant 160 : i8 // %1 is 0b10100000 + %2 = constant 3 : i8 + %3 = shift_right_unsigned %1, %2 : (i8, i8) -> i8 // %3 is 0b00010100 + }]; +} + +def SIToFPOp : CastOp<"sitofp">, Arguments<(ins AnyType:$in)> { + let summary = "cast from integer type to floating-point"; + let description = [{ + Cast from a value interpreted as signed integer to the corresponding + floating-point value. If the value cannot be exactly represented, it is + rounded using the default rounding mode. Only scalars are currently + supported. + }]; + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + }]; + + let hasFolder = 0; +} + +def SplatOp : Std_Op<"splat", [NoSideEffect]> { + let summary = "splat or broadcast operation"; + let description = [{ + The "splat" op reads a value of integer or float type and broadcasts it into + a vector or a tensor. The output of splat is thus a new value of either + vector or tensor type with elemental type being its operand's type. + When the result is a tensor, it has to be statically shaped. + + %1 = splat %0 : vector<8xi32> + %2 = splat %0 : tensor<4x8xi32> + + TODO: Extend this operation to broadcast to dynamically shaped tensors in + the same way dynamically shaped memrefs are handled. + + // Broadcasts %s to a 2-d dynamically shaped tensor, with %m, %n binding + // to the sizes of the two dynamic dimensions. 
+ + %m = "foo"() : () -> (index) + %n = "bar"() : () -> (index) + %t = splat %s [%m, %n] : tensor + + }]; + + let arguments = (ins AnyTypeOf<[AnyInteger, AnyFloat], + "integer or float type">:$input); + let results = (outs AnyTypeOf<[AnyVector, AnyStaticShapeTensor]>:$aggregate); + + let builders = + [OpBuilder<"Builder *builder, OperationState &result, Value element, " + "Type aggregateType", + [{ build(builder, result, aggregateType, element); }]>]; + + let hasFolder = 1; +} + +def StoreOp : Std_Op<"store"> { + let summary = "store operation"; + let description = [{ + The "store" op writes an element to a memref specified by an index list. + The arity of indices is the rank of the memref (i.e. if the memref being + stored to is of rank 3, then 3 indices are required for the store following + the memref identifier). The store operation does not produce a result. + + In the following example, the ssa value '%v' is stored in memref '%A' at + indices [%i, %j]: + store %v, %A[%i, %j] : memref<4x128xf32, (d0, d1) -> (d0, d1), 0> + }]; + + let arguments = (ins AnyType:$value, AnyMemRef:$memref, + Variadic:$indices); + + let builders = [OpBuilder< + "Builder *, OperationState &result, Value valueToStore, Value memref", [{ + result.addOperands(valueToStore); + result.addOperands(memref); + }]>]; + + let extraClassDeclaration = [{ + Value getValueToStore() { return getOperand(0); } + + Value getMemRef() { return getOperand(1); } + void setMemRef(Value value) { setOperand(1, value); } + MemRefType getMemRefType() { + return getMemRef()->getType().cast(); + } + + operand_range getIndices() { + return {operand_begin() + 2, operand_end()}; + } + }]; + + let hasFolder = 1; +} + +def SubFOp : FloatArithmeticOp<"subf"> { + let summary = "floating point subtraction operation"; + let hasFolder = 1; +} + +def SubIOp : IntArithmeticOp<"subi"> { + let summary = "integer subtraction operation"; + let hasFolder = 1; +} + +def SubViewOp : Std_Op<"subview", [AttrSizedOperandSegments, 
NoSideEffect]> { + let summary = "memref subview operation"; + let description = [{ + The "subview" operation converts a memref type to another memref type + which represents a reduced-size view of the original memref as specified by + the operation's offsets, sizes and strides arguments. + + The SubView operation supports the following arguments: + *) Memref: the "base" memref on which to create a "view" memref. + *) Offsets: zero or memref-rank number of dynamic offsets into the "base" + memref at which to create the "view" memref. + *) Sizes: zero or memref-rank dynamic size operands which specify the + dynamic sizes of the result "view" memref type. + *) Strides: zero or memref-rank number of dynamic strides which are applied + multiplicatively to the base memref strides in each dimension. + + Note on the number of operands for offsets, sizes and strides: For + each of these, the number of operands must either be same as the + memref-rank number or empty. For the latter, those values will be + treated as constants. + + Example 1: + + %0 = alloc() : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1)> + + // Create a sub-view of "base" memref '%0' with offset arguments '%c0', + // dynamic sizes for each dimension, and stride arguments '%c1'. + %1 = subview %0[%c0, %c0][%size0, %size1][%c1, %c1] + : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1) > to + memref (d0 * s1 + d1 + s0)> + + Example 2: + + %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> + + // Create a sub-view of "base" memref '%0' with dynamic offsets, sizes, + // and strides. + // Note that dynamic offsets are represented by the linearized dynamic + // offset symbol 's0' in the subview memref layout map, and that the + // dynamic strides operands, after being applied to the base memref + // strides in each dimension, are represented in the view memref layout + // map as symbols 's1', 's2' and 's3'. 
+ %1 = subview %0[%i, %j, %k][%size0, %size1, %size2][%x, %y, %z] + : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to + memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)> + + Example 3: + + %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> + + // Subview with constant offsets, sizes and strides. + %1 = subview %0[][][] + : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to + memref<4x4x4xf32, (d0, d1, d2) -> (d0 * 16 + d1 * 4 + d2 + 8)> + + Example 4: + + %0 = alloc(%arg0, %arg1) : memref + + // Subview with constant size, but dynamic offsets and + // strides. The resulting memref has a static shape, but if the + // base memref has an affine map to describe the layout, the result + // memref also uses an affine map to describe the layout. The + // strides of the result memref is computed as follows: + // + // Let #map1 represents the layout of the base memref, and #map2 + // represents the layout of the result memref. A #mapsubview can be + // constructed to map an index from the result memref to the base + // memref (note that the description below uses more convenient + // naming for symbols, while in affine maps, symbols are + // represented as unsigned numbers that identify that symbol in the + // given affine map. + // + // #mapsubview = (d0, d1)[o0, o1, t0, t1] -> (d0 * t0 + o0, d1 * t1 + o1) + // + // where, o0, o1, ... are offsets, and t0, t1, ... are strides. Then, + // + // #map2 = #map1.compose(#mapsubview) + // + // If the layout map is represented as + // + // #map1 = (d0, d1)[s0, s1, s2] -> (d0 * s1 + d1 * s2 + s0) + // + // then, + // + // #map2 = (d0, d1)[s0, s1, s2, o0, o1, t0, t1] -> + // (d0 * s1 * t0 + d1 * s2 * t1 + o0 * s1 + o1 * s2 + s0) + // + // Representing this canonically + // + // #map2 = (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0) + // + // where, r0 = o0 * s1 + o1 * s2 + s0, r1 = s1 * t0, r2 = s2 * t1. 
+ %1 = subview %0[%i, %j][][%x, %y] : + : memref (d0 * s1 + d1 * s2 + s0)> to + memref<4x4xf32, (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0)> + + // Note that the subview op does not guarantee that the result + // memref is "inbounds" w.r.t to base memref. It is upto the client + // to ensure that the subview is accessed in a manner that is + // in-bounds. + + } + }]; + + // TODO(b/144779634, ravishankarm) : Use different arguments for + // offsets, sizes and strides. + let arguments = (ins + AnyMemRef:$source, + Variadic:$offsets, + Variadic:$sizes, + Variadic:$strides, + I32ElementsAttr:$operand_segment_sizes + ); + let results = (outs AnyMemRef); + + let builders = [ + OpBuilder< + "Builder *b, OperationState &result, Value source, " + "ValueRange offsets, ValueRange sizes, " + "ValueRange strides, Type resultType = Type(), " + "ArrayRef attrs = {}">, + OpBuilder< + "Builder *builder, OperationState &result, " + "Type resultType, Value source"> + ]; + + let extraClassDeclaration = [{ + /// Returns the type of the base memref operand. + MemRefType getBaseMemRefType() { + return source()->getType().cast(); + } + + /// The result of a subview is always a memref. + MemRefType getType() { return getResult()->getType().cast(); } + + /// Returns as integer value the number of offset operands. + int64_t getNumOffsets() { return llvm::size(offsets()); } + + /// Returns as integer value the number of size operands. + int64_t getNumSizes() { return llvm::size(sizes()); } + + /// Returns as integer value the number of stride operands. + int64_t getNumStrides() { return llvm::size(strides()); } + + /// Returns the dynamic sizes for this subview operation if specified. + operand_range getDynamicSizes() { return sizes(); } + + /// Returns in `staticStrides` the static value of the stride + /// operands. Returns failure() if the static value of the stride + /// operands could not be retrieved. 
+ LogicalResult getStaticStrides(SmallVectorImpl &staticStrides); + + // Auxiliary range data structure and helper function that unpacks the + // offset, size and stride operands of the SubViewOp into a list of triples. + // Such a list of triple is sometimes more convenient to manipulate. + struct Range { + Value offset, size, stride; + }; + SmallVector getRanges(); + }]; + + let hasCanonicalizer = 1; +} + +def TanhOp : FloatUnaryOp<"tanh"> { + let summary = "hyperbolic tangent of the specified value"; + let description = [{ + The `tanh` operation computes the hyperbolic tangent. It takes one operand + and returns one result of the same type. This type may be a float scalar + type, a vector whose element type is float, or a tensor of floats. It has + no standard attributes. + }]; +} + +def TensorCastOp : CastOp<"tensor_cast"> { + let summary = "tensor cast operation"; + let description = [{ + The "tensor_cast" operation converts a tensor from one type to an equivalent + type without changing any data elements. The source and destination types + must both be tensor types with the same element type. If both are ranked + then the rank should be the same and static dimensions should match. The + operation is invalid if converting to a mismatching constant dimension. + + Convert from unknown rank to rank 2 with unknown dimension sizes. + %2 = tensor_cast %1 : tensor<*xf32> to tensor + }]; + + let arguments = (ins AnyTensor); + let results = (outs AnyTensor); + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + + /// The result of a tensor_cast is always a tensor. 
+ TensorType getType() { return getResult()->getType().cast(); } + }]; +} + +def TensorLoadOp : Std_Op<"tensor_load", + [SameOperandsAndResultShape, SameOperandsAndResultElementType]> { + let summary = "tensor load operation"; + let description = [{ + The "tensor_load" operation creates a tensor from a memref, making an + independent copy of the element data. The result value is a tensor whose + shape and element type match the memref operand. + + Produce a value of tensor<4x?xf32> type. + %12 = tensor_load %10 : memref<4x?xf32, #layout, memspace0> + }]; + + let arguments = (ins AnyMemRef); + let results = (outs AnyTensor); + // TensorLoadOp is fully verified by traits. + let verifier = ?; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value memref", [{ + auto memrefType = memref->getType().cast(); + auto resultType = RankedTensorType::get(memrefType.getShape(), + memrefType.getElementType()); + result.addOperands(memref); + result.addTypes(resultType); + }]>]; + + + let extraClassDeclaration = [{ + /// The result of a tensor_load is always a tensor. + TensorType getType() { return getResult()->getType().cast(); } + }]; +} + +def TensorStoreOp : Std_Op<"tensor_store", + [SameOperandsShape, SameOperandsElementType]> { + let summary = "tensor store operation"; + let description = [{ + The "tensor_store" operation stores the contents of a tensor into a memref. + The first operand is a value of tensor type, the second operand is a value + of memref type. The shapes and element types of these must match, and are + specified by the memref type. + + Example: + %9 = dim %8, 1 : tensor<4x?xf32> + %10 = alloc(%9) : memref<4x?xf32, #layout, memspace0> + tensor_store %8, %10 : memref<4x?xf32, #layout, memspace0> + }]; + + let arguments = (ins AnyTensor:$tensor, AnyMemRef:$memref); + // TensorStoreOp is fully verified by traits. 
+ let verifier = ?; +} + +def TruncateIOp : Std_Op<"trunci", [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "integer truncation operation"; + let description = [{ + The integer truncation operation takes an integer input of + width M and an integer destination type of width N. The destination + bit-width must be smaller than the input bit-width (N < M). + The top-most (N - M) bits of the input are discarded. + + %1 = constant 21 : i5 // %1 is 0b10101 + %2 = trunci %1 : i5 to i4 // %2 is 0b0101 + %3 = trunci %1 : i5 to i3 // %3 is 0b101 + + %5 = trunci %0 : vector<2 x i32> to vector<2 x i16> + }]; + + let arguments = (ins IntegerLike:$value); + let results = (outs IntegerLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value value, Type destType", [{ + result.addOperands(value); + result.addTypes(destType); + }]>]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + let printer = [{ + return printStandardCastOp(this->getOperation(), p); + }]; +} + +def ViewOp : Std_Op<"view", [NoSideEffect]> { + let summary = "memref view operation"; + let description = [{ + The "view" operation converts a 1-D memref with i8 element type, + to an N-D memref with arbitrary element type. In addition, the ViewOp + supports the following arguments: + *) A single dynamic offset operand can be specified which represents a + a dynamic offset within the base 1-D memref at which to create the + resulting memref view. + *) A dynamic size operand must be specified for each dynamic dimension + in the resulting view memref type. + + // Allocate a flat 1D/i8 memref. + %0 = alloc() : memref<2048xi8> + + // ViewOp with static offset and sizes. + %1 = view %0[][] : memref<2048xi8> to memref<64x4xf32> + + // ViewOp with dynamic offset and one dynamic size. + %2 = view %0[%offset_1024][%size0] + : memref<2048xi8> to memref (d0 * 4 + d1 + s0)> + + // ViewOp creating 3D shape where two of the dim sizes are dynamic. 
+ // *) The dynamic offset specified in the ViewOp is applied to the + // base 1-D memref, and is represented by the symbol 's0' in the + // layout map of the ViewOp result memref type. + // *) The dynamic size for the second dimension induces a dynamic + // stride for the first dimension, which is represented by the + // symbol 's1' in the layout map of the ViewOp result memref type. + // Note that this dynamic stride will be computed from the view + // shape and dynamic sizes. + %3 = view %0[%offset_1024][%size0, %size1] + : memref<2048xi8> to memref (d0 * s1 + d1 * 4 + d2 + s0)> + }]; + + let arguments = (ins MemRefRankOf<[I8], [1]>:$source, + Variadic:$operands); + let results = (outs AnyMemRef); + + let extraClassDeclaration = [{ + /// The result of a view is always a memref. + MemRefType getType() { return getResult()->getType().cast(); } + + /// Returns the dynamic offset for this view operation if specified. + /// Returns nullptr if no dynamic offset was specified. + Value getDynamicOffset(); + + /// Returns the starting operand list position of the dynamic size operands. + unsigned getDynamicSizesOperandStart() { + return getDynamicOffset() == nullptr ? 1 : 2; + } + + /// Returns the dynamic sizes for this view operation. + operand_range getDynamicSizes() { + return {operand_begin() + getDynamicSizesOperandStart(), operand_end()}; + } + }]; + + let hasCanonicalizer = 1; +} + +def XOrOp : IntArithmeticOp<"xor", [Commutative]> { + let summary = "integer binary xor"; + let hasFolder = 1; +} + +def ZeroExtendIOp : Std_Op<"zexti", [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "integer zero extension operation"; + let description = [{ + The integer zero extension operation takes an integer input of + width M and an integer destination type of width N. The destination + bit-width must be larger than the input bit-width (N > M). + The top-most (N - M) bits of the output are filled with zeros. 
+ + %1 = constant 5 : i3 // %1 is 0b101 + %2 = zexti %1 : i3 to i6 // %2 is 0b000101 + %3 = constant 2 : i3 // %3 is 0b010 + %4 = zexti %3 : i3 to i6 // %4 is 0b000010 + + %5 = zexti %0 : vector<2 x i32> to vector<2 x i64> + }]; + + let arguments = (ins IntegerLike:$value); + let results = (outs IntegerLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value value, Type destType", [{ + result.addOperands(value); + result.addTypes(destType); + }]>]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + let printer = [{ + return printStandardCastOp(this->getOperation(), p); + }]; +} + +#endif // STANDARD_OPS diff --git a/mlir/include/mlir/Dialect/Traits.h b/mlir/include/mlir/Dialect/Traits.h new file mode 100644 index 0000000000000000000000000000000000000000..87c8e662a65521eeb187d11d0f8df54016866114 --- /dev/null +++ b/mlir/include/mlir/Dialect/Traits.h @@ -0,0 +1,80 @@ +//===- Traits.h - Common op traits shared by dialects -----------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares common op traits that are not core to MLIR but can be +// shared by multiple dialects. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_TRAITS +#define MLIR_DIALECT_TRAITS + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace OpTrait { + +// These functions are out-of-line implementations of the methods in the +// corresponding trait classes. This avoids them being template +// instantiated/duplicated. 
+namespace impl { +LogicalResult verifyCompatibleOperandBroadcast(Operation *op); +} // namespace impl + +namespace util { +/// Returns true and sets `resultShape` to the broadcasted shape from the two +/// given shapes if they are broadcast compatible. Returns false and clears +/// `resultShape` otherwise. +/// +/// The rules for determining the result shape are: +/// +/// Zip together the dimensions in the two given shapes by prepending the shape +/// with less dimensions with 1s. For each dimension pair, deduces the result +/// dimension according to the following order: +/// - If there are unknown dimensions, follows the TensorFlow behavior: +/// - If either dimension is greater than 1, we assume that the program is +/// correct, and the other dimension will be broadcast to match it. +/// - If either dimension is 1, the other dimension is the result. +/// - Otherwise, the result dimension is unknown dimension. +/// - If one of the dimension is 1, the other dimension is the result. +/// - If two dimensions are the same, that's the result. +/// - Otherwise, incompatible shape. +bool getBroadcastedShape(ArrayRef shape1, ArrayRef shape2, + SmallVectorImpl &resultShape); + +/// Returns the result broadcast composition type from the two given types by +/// following NumPy broadcast semantics. Returned type may have dynamic shape if +/// either of the input types has dynamic shape. Returns null type if the two +/// given types are not broadcast-compatible. +Type getBroadcastedType(Type type1, Type type2); +} // namespace util + +/// This class provides the API for ops that are known to have broadcast- +/// compatible operand and result types. Specifically, starting from the +/// most varying dimension, each dimension pair of the two operands' types +/// should either be the same or one of them is one. Also, the result type +/// should have the corresponding dimension equal to the larger one, if known. 
+/// Shapes are checked partially if ranks or dimensions are not known. For +/// example, an op with tensor and tensor <2 x f32> as operand +/// types and tensor<3 x 2 x f32> as the result type is broadcast-compatible. +/// +/// Ths trait assumes the op has two operands and one result, and it asserts +/// if the pre-condition is not satisfied. +template +class BroadcastableTwoOperandsOneResult + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyCompatibleOperandBroadcast(op); + } +}; + +} // end namespace OpTrait +} // end namespace mlir + +#endif // MLIR_DIALECT_TRAITS diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..9e7cbba0f433996c35504c00148d524645c025eb --- /dev/null +++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h @@ -0,0 +1,105 @@ +//===- StructuredOpsUtils.h - Utilities used by structured ops --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file define utilities that operate on standard types and are +// useful across multiple dialects that use structured ops abstractions. These +// abstractions consist of define custom operations that encode and transport +// information about their semantics (e.g. type of iterators like parallel, +// reduction, etc..) as attributes. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_UTILS_STRUCTUREDOPSUTILS_H +#define MLIR_DIALECT_UTILS_STRUCTUREDOPSUTILS_H + +#include "mlir/IR/Attributes.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/StringRef.h" + +namespace mlir { +/// Attribute name for the AffineArrayAttr which encodes the relationship +/// between a structured op iterators' and its operands. +static constexpr StringLiteral getIndexingMapsAttrName() { + return StringLiteral("indexing_maps"); +} + +/// Attribute name for the StrArrayAttr which encodes the type of a structured +/// op's iterators. +static constexpr StringLiteral getIteratorTypesAttrName() { + return StringLiteral("iterator_types"); +} + +/// Attribute name for the IntegerAttr which encodes the number of input buffer +/// arguments. +static constexpr StringLiteral getArgsInAttrName() { + return StringLiteral("args_in"); +} + +/// Attribute name for the IntegerAttr which encodes the number of input buffer +/// arguments. +static constexpr StringLiteral getArgsOutAttrName() { + return StringLiteral("args_out"); +} + +/// Attribute name for the StringAttr which encodes an optional documentation +/// string of the structured op. +static constexpr StringLiteral getDocAttrName() { return StringLiteral("doc"); } + +/// Attribute name for the StrArrayAttr which encodes the SymbolAttr for the +/// MLIR function that implements the body of the structured op. +static constexpr StringLiteral getFunAttrName() { return StringLiteral("fun"); } + +/// Attribute name for the StrArrayAttr which encodes the external library +/// function that implements the structured op. +static constexpr StringLiteral getLibraryCallAttrName() { + return StringLiteral("library_call"); +} + +/// Use to encode that a particular iterator type has parallel semantics. 
+inline static constexpr StringLiteral getParallelIteratorTypeName() { + return StringLiteral("parallel"); +} + +/// Use to encode that a particular iterator type has reduction semantics. +inline static constexpr StringLiteral getReductionIteratorTypeName() { + return StringLiteral("reduction"); +} + +/// Use to encode that a particular iterator type has window semantics. +inline static constexpr StringLiteral getWindowIteratorTypeName() { + return StringLiteral("window"); +} + +/// Use to encode that a particular iterator type has window semantics. +inline static ArrayRef getAllIteratorTypeNames() { + static StringRef names[3] = {getParallelIteratorTypeName(), + getReductionIteratorTypeName(), + getWindowIteratorTypeName()}; + return llvm::makeArrayRef(names); +} + +/// Returns the iterator of a certain type. +inline unsigned getNumIterators(StringRef name, ArrayAttr iteratorTypes) { + auto names = getAllIteratorTypeNames(); + (void)names; + assert(llvm::is_contained(names, name)); + return llvm::count_if(iteratorTypes, [name](Attribute a) { + return a.cast().getValue() == name; + }); +} + +inline unsigned getNumIterators(ArrayAttr iteratorTypes) { + unsigned res = 0; + for (auto n : getAllIteratorTypeNames()) + res += getNumIterators(n, iteratorTypes); + return res; +} + +} // end namespace mlir + +#endif // MLIR_UTILS_STRUCTUREDOPSUTILS_H diff --git a/mlir/include/mlir/Dialect/VectorOps/CMakeLists.txt b/mlir/include/mlir/Dialect/VectorOps/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ce3168c55800dc026456021b4ae6770e7a23493 --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/CMakeLists.txt @@ -0,0 +1,5 @@ +add_mlir_dialect(VectorOps VectorOps) + +set(LLVM_TARGET_DEFINITIONS VectorTransformPatterns.td) +mlir_tablegen(VectorTransformPatterns.h.inc -gen-rewriters) +add_public_tablegen_target(MLIRVectorTransformPatternsIncGen) diff --git a/mlir/include/mlir/Dialect/VectorOps/Utils.h 
b/mlir/include/mlir/Dialect/VectorOps/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..5f19f849e3fe55224b8761a04b81e65636d4ae4c --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/Utils.h @@ -0,0 +1,134 @@ +//===- Utils.h - VectorOps Utils ----------------------------*- C++ -*-=======// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_VECTOROPS_UTILS_H_ +#define MLIR_DIALECT_VECTOROPS_UTILS_H_ + +#include "mlir/Support/LLVM.h" + +#include "llvm/ADT/DenseMap.h" + +namespace mlir { + +class AffineApplyOp; +class AffineForOp; +class AffineMap; +class Location; +class MemRefType; +class OpBuilder; +class Operation; +class Value; +class VectorType; + +/// Computes and returns the multi-dimensional ratio of `superShape` to +/// `subShape`. This is calculated by performing a traversal from minor to major +/// dimensions (i.e. in reverse shape order). If integral division is not +/// possible, returns None. +/// The ArrayRefs are assumed (and enforced) to only contain > 1 values. +/// This constraint comes from the fact that they are meant to be used with +/// VectorTypes, for which the property holds by construction. +/// +/// Examples: +/// - shapeRatio({3, 4, 5, 8}, {2, 5, 2}) returns {3, 2, 1, 4} +/// - shapeRatio({3, 4, 4, 8}, {2, 5, 2}) returns None +/// - shapeRatio({1, 2, 10, 32}, {2, 5, 2}) returns {1, 1, 2, 16} +Optional> shapeRatio(ArrayRef superShape, + ArrayRef subShape); + +/// Computes and returns the multi-dimensional ratio of the shapes of +/// `superVector` to `subVector`. If integral division is not possible, returns +/// None. +/// Assumes and enforces that the VectorTypes have the same elemental type. 
+Optional> shapeRatio(VectorType superVectorType, + VectorType subVectorType); + +/// Constructs a permutation map of invariant memref indices to vector +/// dimension. +/// +/// If no index is found to be invariant, 0 is added to the permutation_map and +/// corresponds to a vector broadcast along that dimension. +/// +/// The implementation uses the knowledge of the mapping of loops to +/// vector dimension. `loopToVectorDim` carries this information as a map with: +/// - keys representing "vectorized enclosing loops"; +/// - values representing the corresponding vector dimension. +/// Note that loopToVectorDim is a whole function map from which only enclosing +/// loop information is extracted. +/// +/// Prerequisites: `opInst` is a vectorizable load or store operation (i.e. at +/// most one invariant index along each AffineForOp of `loopToVectorDim`). +/// +/// Example 1: +/// The following MLIR snippet: +/// +/// ```mlir +/// affine.for %i3 = 0 to %0 { +/// affine.for %i4 = 0 to %1 { +/// affine.for %i5 = 0 to %2 { +/// %a5 = load %arg0[%i4, %i5, %i3] : memref +/// }}} +/// ``` +/// +/// may vectorize with {permutation_map: (d0, d1, d2) -> (d2, d1)} into: +/// +/// ```mlir +/// affine.for %i3 = 0 to %0 step 32 { +/// affine.for %i4 = 0 to %1 { +/// affine.for %i5 = 0 to %2 step 256 { +/// %4 = vector.transfer_read %arg0, %i4, %i5, %i3 +/// {permutation_map: (d0, d1, d2) -> (d2, d1)} : +/// (memref, index, index) -> vector<32x256xf32> +/// }}} +/// ``` +/// +/// Meaning that vector.transfer_read will be responsible for reading the slice: +/// `%arg0[%i4, %i5:%15+256, %i3:%i3+32]` into vector<32x256xf32>. 
+/// +/// Example 2: +/// The following MLIR snippet: +/// +/// ```mlir +/// %cst0 = constant 0 : index +/// affine.for %i0 = 0 to %0 { +/// %a0 = load %arg0[%cst0, %cst0] : memref +/// } +/// ``` +/// +/// may vectorize with {permutation_map: (d0) -> (0)} into: +/// +/// ```mlir +/// affine.for %i0 = 0 to %0 step 128 { +/// %3 = vector.transfer_read %arg0, %c0_0, %c0_0 +/// {permutation_map: (d0, d1) -> (0)} : +/// (memref, index, index) -> vector<128xf32> +/// } +/// ```` +/// +/// Meaning that vector.transfer_read will be responsible of reading the slice +/// `%arg0[%c0, %c0]` into vector<128xf32> which needs a 1-D vector broadcast. +/// +AffineMap +makePermutationMap(Operation *op, ArrayRef indices, + const DenseMap &loopToVectorDim); + +namespace matcher { + +/// Matches vector.transfer_read, vector.transfer_write and ops that return a +/// vector type that is a multiple of the sub-vector type. This allows passing +/// over other smaller vector types in the function and avoids interfering with +/// operations on those. +/// This is a first approximation, it can easily be extended in the future. +/// TODO(ntv): this could all be much simpler if we added a bit that a vector +/// type to mark that a vector is a strict super-vector but it still does not +/// warrant adding even 1 extra bit in the IR for now. +bool operatesOnSuperVectorsOf(Operation &op, VectorType subVectorType); + +} // end namespace matcher +} // end namespace mlir + +#endif // MLIR_DIALECT_VECTOROPS_UTILS_H_ diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.h b/mlir/include/mlir/Dialect/VectorOps/VectorOps.h new file mode 100644 index 0000000000000000000000000000000000000000..7234d46b765669f744bfa2ac0caece671a7ed018 --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.h @@ -0,0 +1,59 @@ +//===- VectorOps.h - MLIR Super Vectorizer Operations -----------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Vector dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_VECTOROPS_VECTOROPS_H +#define MLIR_DIALECT_VECTOROPS_VECTOROPS_H + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" + +namespace mlir { +class MLIRContext; +class OwningRewritePatternList; +namespace vector { + +/// Dialect for Ops on higher-dimensional vector types. +class VectorOpsDialect : public Dialect { +public: + VectorOpsDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "vector"; } + + /// Materialize a single constant operation from a given attribute value with + /// the desired resultant type. + Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, + Location loc) override; +}; + +/// Collect a set of vector-to-vector canonicalization patterns. +void populateVectorToVectorCanonicalizationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context); + +/// Collect a set of vector-to-vector transformation patterns. +void populateVectorToVectorTransformationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context); + +/// Returns the integer type required for subscripts in the vector dialect. +IntegerType getVectorSubscriptType(Builder &builder); + +/// Returns an integer array attribute containing the given values using +/// the integer type required for subscripts in the vector dialect. 
+ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef values); + +#define GET_OP_CLASSES +#include "mlir/Dialect/VectorOps/VectorOps.h.inc" + +} // end namespace vector +} // end namespace mlir + +#endif // MLIR_DIALECT_VECTOROPS_VECTOROPS_H diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td new file mode 100644 index 0000000000000000000000000000000000000000..8726b162fd6169fda6f1469781c0050e8545da5a --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -0,0 +1,1152 @@ +//===- VectorOps.td - Vector op definitions ---------------*- tablegen -*-====// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines MLIR vector operations. +// +//===----------------------------------------------------------------------===// + +#ifndef VECTOR_OPS +#define VECTOR_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/AffineOps/AffineOpsBase.td" + +def Vector_Dialect : Dialect { + let name = "vector"; + let cppNamespace = "vector"; +} + +// Base class for Vector dialect ops. +class Vector_Op traits = []> : + Op { + // For every vector op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions. + let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +// TODO(andydavis, ntv) Add an attribute to specify a different algebra +// with operators other than the current set: {*, +}. 
+def Vector_ContractionOp : + Vector_Op<"contract", [NoSideEffect]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc, + Variadic>:$masks, + AffineMapArrayAttr:$indexing_maps, ArrayAttr:$iterator_types)>, + Results<(outs AnyVector)> { + let summary = "vector contraction operation"; + let description = [{ + Computes the sum of products of vector elements along contracting + dimension pairs from 2 vectors of rank M and N respectively, adds this + intermediate result to the accumulator argument of rank K, and returns a + vector result of rank K (where K = num_lhs_free_dims + num_rhs_free_dims + + num_batch_dims (see dimension type descriptions below)). + + Optional vector mask arguments (produced by CreateMaskOp or ConstantMaskOp) + specify the dynamic dimension sizes of valid data within the lhs/rhs vector + arguments. + + An iterator type attribute list must be specified, where each element of + the list represents an iterator with one of the following types: + + *) "reduction": reduction dimensions are present in the lhs and rhs + arguments but not in the output (or optional accumulator + argument). These are the dimensions along which the vector + contraction op computes the sum of products, and + contracting dimension pair dimension sizes must match + between lhs/rhs. + *) "parallel": Batch dimensions are iterator type "parallel", and + are non-contracting dimensions present in the lhs, rhs and + output. The lhs/rhs co-iterate along the batch dimensions, + which should be expressed in their indexing maps. + + Free dimensions are iterator type "parallel", and are + non-contraction, non-batch dimensions accessed by either the + lhs or rhs (but not both). The lhs and rhs free dimensions + are unrelated to each other and do not co-iterate, which + should be expressed in their indexing maps. + + An indexing map attribute list must be specified with an entry for lhs, rhs + and acc arguments. 
An indexing map attribute specifies a mapping from each + iterator in the iterator type list, to each dimension of an N-D vector. + + Examples: + + // 2D vector contraction with one contracting dimension (matmul). + #contraction_accesses = [ + (i, j, k) -> (i, k), + (i, j, k) -> (k, j), + (i, j, k) -> (i, j) + ] + #contraction_trait = { + indexing_maps = #contraction_accesses, + iterator_types = [parallel, parallel, reduction] + } + + %3 = vector.contract #contraction_trait %0, %1, %2 + : vector<4x3xf32>, vector<3x7xf32> into vector<4x7xf32> + + // 4D to 3D vector contraction with two contracting dimensions and + // one batch dimension. + #contraction_accesses = [ + (b0, f0, f1, c0, c1) -> (c0, b0, c1, f0), + (b0, f0, f1, c0, c1) -> (b0, c1, c0, f1), + (b0, f0, f1, c0, c1) -> (b0, f0, f1) + ] + #contraction_trait = { + indexing_maps = #contraction_accesses, + iterator_types = [parallel, parallel, parallel, reduction, reduction] + } + + %4 = vector.contract #contraction_trait %0, %1, %2 + : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32> + + // 4D vector contraction with two contracting dimensions and optional + // vector mask arguments.
+ %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1> + %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1> + + %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask + : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32> + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value lhs, Value rhs, " + "Value acc, ArrayAttr indexingMaps, ArrayAttr iteratorTypes">]; + let extraClassDeclaration = [{ + VectorType getLhsType() { + return lhs()->getType().cast(); + } + VectorType getRhsType() { + return rhs()->getType().cast(); + } + VectorType getAccType() { + return acc()->getType().cast(); + } + VectorType getLHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(3)->getType().cast(); + } + VectorType getRHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(4)->getType().cast(); + } + VectorType getResultType() { + return getResult()->getType().cast(); + } + ArrayRef getTraitAttrNames(); + SmallVector getIndexingMaps(); + static unsigned getAccOperandIndex() { return 2; } + + // Returns the bounds of each dimension in the iteration space spanned + // by the iterator types of this operation. + void getIterationBounds(SmallVectorImpl &iterationBounds); + + // Returns a list of index maps, where there is a list entry for each + // op indexing map attribute (i.e. one for each input and output, with + // the output listed last). Each index map, maps from this operations + // iteration space, to vector dimensions of the maps input/output. 
+ void getIterationIndexMap( + std::vector> &iterationIndexMap); + + std::vector> getContractingDimMap(); + std::vector> getBatchDimMap(); + }]; +} + +def Vector_BroadcastOp : + Vector_Op<"broadcast", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyType:$source)>, + Results<(outs AnyVector:$vector)> { + let summary = "broadcast operation"; + let description = [{ + Broadcasts the scalar or k-D vector value in the source operand + to a n-D result vector such that the broadcast makes sense, i.e., + the source operand is duplicated to match the given rank and sizes + in the result vector. The legality rules are: + * the source operand must have the same element type as the result type + * a k-D vector can be broadcast to + a n-D vector if + * k <= n, and + * the sizes in the trailing dimensions n-k < i <= n with j=i+k-n + match exactly as s_j = t_i or s_j = 1: + ``` + t_1 x .. t_n-k x t_n-k+1 x .. x t_i x .. x t_n + s_1 x .. x s_j x .. x s_k + + ``` + The source operand is duplicated over all the missing leading dimensions + and stretched over the trailing dimensions where the source has a non-equal + dimension of 1. These rules imply that any scalar broadcast (k=0) to any + shaped vector with the same element type is always legal. 
+ + Examples: + ``` + %0 = constant 0.0 : f32 + %1 = vector.broadcast %0 : f32 to vector<16xf32> + %2 = vector.broadcast %1 : vector<16xf32> to vector<4x16xf32> + ``` + }]; + let extraClassDeclaration = [{ + Type getSourceType() { return source()->getType(); } + VectorType getVectorType() { + return vector()->getType().cast(); + } + }]; +} + +def Vector_ShuffleOp : + Vector_Op<"shuffle", [NoSideEffect, + PredOpTrait<"first operand v1 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"second operand v2 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyVector:$v1, AnyVector:$v2, I64ArrayAttr:$mask)>, + Results<(outs AnyVector:$vector)> { + let summary = "shuffle operation"; + let description = [{ + The shuffle operation constructs a permutation (or duplication) of elements + from two input vectors, returning a vector with the same element type as + the input and a length that is the same as the shuffle mask. The two input + vectors must have the same element type, rank, and trailing dimension sizes + and shuffles their values in the leading dimension (which may differ in size) + according to the given mask. The legality rules are: + * the two operands must have the same element type as the result + * the two operands and the result must have the same rank and trailing + dimension sizes, viz. given two k-D operands + v1 : and + v2 : + we have s_i = t_i for all 1 < i <= k + * the mask length equals the leading dimension size of the result + * numbering the input vector indices left to right accross the operands, all + mask values must be within range, viz. 
given two k-D operands v1 and v2 + above, all mask values are in the range [0,s_1+t_1) + + Examples: + ``` + %0 = vector.shuffle %a, %b[0, 3] + : vector<2xf32>, vector<2xf32> ; yields vector<2xf32> + %1 = vector.shuffle %c, %b[0, 1, 2] + : vector<2x16xf32>, vector<1x16xf32> ; yields vector<3x16xf32> + %2 = vector.shuffle %a, %b[3, 2, 1, 0] + : vector<2xf32>, vector<2xf32> ; yields vector<4xf32> + + ``` + }]; + let builders = [OpBuilder<"Builder *builder, OperationState &result," + "Value v1, Value v2, ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getMaskAttrName() { return "mask"; } + VectorType getV1VectorType() { + return v1()->getType().cast(); + } + VectorType getV2VectorType() { + return v2()->getType().cast(); + } + VectorType getVectorType() { + return vector()->getType().cast(); + } + }]; +} + +def Vector_ExtractElementOp : + Vector_Op<"extractelement", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, AnyInteger:$position)>, + Results<(outs AnyType)> { + let summary = "extractelement operation"; + let description = [{ + Takes an 1-D vector and a dynamic index position and extracts the + scalar at that position. Note that this instruction resembles + vector.extract, but is restricted to 1-D vectors and relaxed + to dynamic indices. 
It is meant to be closer to LLVM's version: + https://llvm.org/docs/LangRef.html#extractelement-instruction + + Example: + ``` + %c = constant 15 : i32 + %1 = vector.extractelement %0[%c : i32]: vector<16xf32> + ``` + }]; + let extraClassDeclaration = [{ + VectorType getVectorType() { + return vector()->getType().cast(); + } + }]; +} + +def Vector_ExtractOp : + Vector_Op<"extract", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$position)>, + Results<(outs AnyType)> { + let summary = "extract operation"; + let description = [{ + Takes an n-D vector and a k-D position and extracts the (n-k)-D vector at + the proper position. Degenerates to an element type in the 0-D case. + + Examples: + ``` + %1 = vector.extract %0[3]: vector<4x8x16xf32> + %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32> + ``` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source," + "ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getPositionAttrName() { return "position"; } + VectorType getVectorType() { + return vector()->getType().cast(); + } + }]; +} + +def Vector_ExtractSlicesOp : + Vector_Op<"extract_slices", [NoSideEffect]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$sizes, + I64ArrayAttr:$strides)>, + Results<(outs TupleOf<[AnyVector]>)> { + let summary = "vector extract slices operation"; + let description = [{ + Takes an N-d vector and returns a tuple of vector slices of 'vector', + based on 'sizes' and 'strides' parameters. + + The arguments 'sizes' and 'strides' represent a specification for + generating the unrolling of 'vector' shape, which has all slices of shape + 'sizes' except for slices at dimension boundaries when 'vector' dimension + sizes are not a multiple of 'sizes'. 
+ + Each slice is returned at the tuple element index corresponding to the + linear index of the slice w.r.t the unrolling scheme represented by 'sizes'. + Currently, only unit strides are supported. + + Examples: + ``` + %0 = vector.transfer_read ...: vector<4x2xf32> + + %1 = vector.extract_slices %0, [2, 2], [1, 1] + : vector<4x2xf32> into tuple, vector<2x2xf32>> + + // Example with partial slices at dimension boundaries. + %2 = vector.transfer_read ...: vector<4x3xf32> + + %3 = vector.extract_slices %2, [2, 2], [1, 1] + : vector<4x3xf32> into tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> + ``` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, TupleType tupleType, " # + "Value vector, ArrayRef sizes, " # + "ArrayRef strides">]; + let extraClassDeclaration = [{ + VectorType getSourceVectorType() { + return vector()->getType().cast(); + } + TupleType getResultTupleType() { + return getResult()->getType().cast(); + } + void getSizes(SmallVectorImpl &results); + void getStrides(SmallVectorImpl &results); + static StringRef getSizesAttrName() { return "sizes"; } + static StringRef getStridesAttrName() { return "strides"; } + }]; +} + +def Vector_InsertElementOp : + Vector_Op<"insertelement", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"dest operand and result have same type", + TCresIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyType:$source, AnyVector:$dest, AnyInteger:$position)>, + Results<(outs AnyVector)> { + let summary = "insertelement operation"; + let description = [{ + Takes a scalar source, an 1-D destination vector and a dynamic index + position and inserts the source into the destination at the proper + position. Note that this instruction resembles vector.insert, but + is restricted to 1-D vectors and relaxed to dynamic indices. 
It is + meant to be closer to LLVM's version: + https://llvm.org/docs/LangRef.html#insertelement-instruction + + Example: + ``` + %c = constant 15 : i32 + %f = constant 0.0f : f32 + %1 = vector.insertelement %f, %0[%c : i32]: vector<16xf32> + ``` + }]; + let extraClassDeclaration = [{ + Type getSourceType() { return source()->getType(); } + VectorType getDestVectorType() { + return dest()->getType().cast(); + } + }]; +} + +def Vector_InsertOp : + Vector_Op<"insert", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"dest operand and result have same type", + TCresIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyType:$source, AnyVector:$dest, I64ArrayAttr:$position)>, + Results<(outs AnyVector)> { + let summary = "insert operation"; + let description = [{ + Takes an n-D source vector, an (n+k)-D destination vector and a k-D position + and inserts the n-D source into the (n+k)-D destination at the proper + position. Degenerates to a scalar source type when n = 0. + + Examples: + ``` + %2 = vector.insert %0, %1[3]: + vector<8x16xf32> into vector<4x8x16xf32> + %5 = vector.insert %3, %4[3, 3, 3]: + f32 into vector<4x8x16xf32> + ``` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, " # + "Value dest, ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getPositionAttrName() { return "position"; } + Type getSourceType() { return source()->getType(); } + VectorType getDestVectorType() { + return dest()->getType().cast(); + } + }]; +} + +def Vector_InsertSlicesOp : + Vector_Op<"insert_slices", [NoSideEffect]>, + Arguments<(ins TupleOf<[AnyVector]>:$vectors, I64ArrayAttr:$sizes, + I64ArrayAttr:$strides)>, + Results<(outs AnyVector)> { + let summary = "vector insert slices operation"; + let description = [{ + Takes a tuple of vector slices and inserts them into the vector result + according to the 'sizes' and 'strides' parameters. 
+ + The arguments 'sizes' and 'strides' represent a specification for + generating the unrolling of 'vector' shape, which has all slices of shape + 'sizes' except for slices at dimension boundaries when 'vector' dimension + sizes are not a multiple of 'sizes'. + + Each slice in 'vectors' is at the tuple element index corresponding to the + linear index of the slice w.r.t the unrolling scheme represented by 'sizes'. + Currently, only unit strides are supported. + + Examples: + ``` + %0 = vector.extract_slices %0, [2, 2], [1, 1] + : vector<4x2xf32> into tuple, vector<2x2xf32>> + + %1 = vector.insert_slices %0, [2, 2], [1, 1] + : tuple, vector<2x2xf32>> into vector<4x2xf32> + + // Example with partial slices at dimension boundaries. + %3 = vector.extract_slices %2, [2, 2], [1, 1] + : vector<4x3xf32> into tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> + + %4 = vector.insert_slices %3, [2, 2], [1, 1] + : tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> into vector<4x3xf32> + ``` + }]; + + let extraClassDeclaration = [{ + TupleType getSourceTupleType() { + return vectors()->getType().cast(); + } + VectorType getResultVectorType() { + return getResult()->getType().cast(); + } + void getSizes(SmallVectorImpl &results); + void getStrides(SmallVectorImpl &results); + static StringRef getSizesAttrName() { return "sizes"; } + static StringRef getStridesAttrName() { return "strides"; } + }]; +} + +def Vector_InsertStridedSliceOp : + Vector_Op<"insert_strided_slice", [NoSideEffect, + PredOpTrait<"operand #0 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"dest operand and result have same type", + TCresIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyVector:$source, AnyVector:$dest, I64ArrayAttr:$offsets, + I64ArrayAttr:$strides)>, + Results<(outs AnyVector)> { + let summary = "strided_slice operation"; + let description = [{ + Takes a k-D source vector, an n-D destination vector (n >= k), n-D `offsets` + integer array 
attribute, a k-D `strides` integer array attribute and inserts + the k-D source vector as a strided subvector at the proper offset into the + n-D destination vector. + + At the moment strides must contain only 1s. + + Returns an n-D vector that is a copy of the n-D destination vector in which + the last k-D dimensions contain the k-D source vector elements strided at + the proper location as specified by the offsets. + + Examples: + ``` + %2 = vector.insert_strided_slice %0, %1 + {offsets : [0, 0, 2], strides : [1, 1]}: + vector<2x4xf32> into vector<16x4x8xf32> + ``` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, Value dest, " # + "ArrayRef offsets, ArrayRef strides">]; + let extraClassDeclaration = [{ + static StringRef getOffsetsAttrName() { return "offsets"; } + static StringRef getStridesAttrName() { return "strides"; } + VectorType getSourceVectorType() { + return source()->getType().cast(); + } + VectorType getDestVectorType() { + return dest()->getType().cast(); + } + }]; +} + +def Vector_OuterProductOp : + Vector_Op<"outerproduct", [NoSideEffect, SameOperandsAndResultElementType]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, Variadic:$acc)>, + Results<(outs AnyVector)> { + let summary = "vector outerproduct with optional fused add"; + let description = [{ + Takes 2 1-D vectors and returns the 2-D vector containing the outer product. + + An optional extra 2-D vector argument may be specified in which case the + operation returns the sum of the outer product and the extra vector. When + lowered to the LLVMIR dialect, this form emits `llvm.intr.fmuladd`, which + can lower to actual `fma` instructions in LLVM. 
+ + Examples + + %2 = vector.outerproduct %0, %1: vector<4xf32>, vector<8xf32> + return %2: vector<4x8xf32> + + %3 = vector.outerproduct %0, %1, %2: + vector<4xf32>, vector<8xf32>, vector<4x8xf32> + return %3: vector<4x8xf32> + }]; + let extraClassDeclaration = [{ + VectorType getOperandVectorTypeLHS() { + return lhs()->getType().cast(); + } + VectorType getOperandVectorTypeRHS() { + return rhs()->getType().cast(); + } + VectorType getOperandVectorTypeACC() { + return (llvm::size(acc()) == 0) ? VectorType() : + (*acc().begin())->getType().cast(); + } + VectorType getVectorType() { + return getResult()->getType().cast(); + } + }]; +} + +// TODO(andydavis) Add transformation which decomposes ReshapeOp into an +// optimized sequence of vector rotate/shuffle/select operations. +def Vector_ReshapeOp : + Vector_Op<"reshape", [AttrSizedOperandSegments, NoSideEffect]>, + Arguments<(ins AnyVector:$vector, Variadic:$input_shape, + Variadic:$output_shape, + I64ArrayAttr:$fixed_vector_sizes, + I32ElementsAttr:$operand_segment_sizes)>, + Results<(outs AnyVector)> { + let summary = "vector reshape operation"; + let description = [{ + Reshapes its vector operand from 'input_shape' to 'output_shape' maintaining + fixed vector dimension 'fixed_vector_sizes' on the innermost vector + dimensions. + + The parameters 'input_shape' and 'output_shape' represent valid data shapes + across fixed vector shapes. For example, if a vector has a valid data + shape [6] with fixed vector size [8], then the valid data elements are + assumed to be stored at the beginning of the vector with the remaining + vector elements undefined. + + In the examples below, valid data elements are represented by an alphabetic + character, and undefined data elements are represented by '-'. 
+ + Example + + vector<1x8xf32> with valid data shape [6], fixed vector sizes [8] + + input: [a, b, c, d, e, f] + + layout map: (d0) -> (d0 floordiv 8, d0 mod 8) + + vector layout: [a, b, c, d, e, f, -, -] + + Example + + vector<2x8xf32> with valid data shape [10], fixed vector sizes [8] + + input: [a, b, c, d, e, f, g, h, i, j] + + layout map: (d0) -> (d0 floordiv 8, d0 mod 8) + + vector layout: [[a, b, c, d, e, f, g, h], + [i, j, -, -, -, -, -, -]] + + Example + + vector<2x2x2x3xf32> with valid data shape [3, 5], fixed vector sizes + [2, 3] + + input: [[a, b, c, d, e], + [f, g, h, i, j], + [k, l, m, n, o]] + + layout map: (d0, d1) -> (d0 floordiv 3, d1 floordiv 5, + d0 mod 3, d1 mod 5) + + vector layout: [[[[a, b, c], + [f, g, h]] + [[d, e, -], + [i, j, -]]], + [[[k, l, m], + [-, -, -]] + [[n, o, -], + [-, -, -]]]] + + Example + + %1 = vector.reshape %0, [%c3, %c6], [%c2, %c9], [4] + : vector<3x2x4xf32> to vector<2x3x4xf32> + + input: [[a, b, c, d, e, f], + [g, h, i, j, k, l], + [m, n, o, p, q, r]] + + layout map: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4) + + + Input vector: [[[a, b, c, d], + [e, f, -, -]], + [[g, h, i, j], + [k, l, -, -]], + [[m, n, o, p], + [q, r, -, -]]] + + Output vector: [[[a, b, c, d], + [e, f, g, h], + [i, -, -, -]], + [[j, k, l, m], + [n, o, p, q], + [r, -, -, -]]] + }]; + + let extraClassDeclaration = [{ + VectorType getInputVectorType() { + return vector()->getType().cast(); + } + VectorType getOutputVectorType() { + return getResult()->getType().cast(); + } + + /// Returns as integer value the number of input shape operands. + int64_t getNumInputShapeSizes() { return input_shape().size(); } + + /// Returns as integer value the number of output shape operands. 
+ int64_t getNumOutputShapeSizes() { return output_shape().size(); } + + void getFixedVectorSizes(SmallVectorImpl &results); + + static StringRef getFixedVectorSizesAttrName() { + return "fixed_vector_sizes"; + } + static StringRef getInputShapeAttrName() { return "input_shape"; } + static StringRef getOutputShapeAttrName() { return "output_shape"; } + }]; +} + +def Vector_StridedSliceOp : + Vector_Op<"strided_slice", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$offsets, + I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>, + Results<(outs AnyVector)> { + let summary = "strided_slice operation"; + let description = [{ + Takes an n-D vector, k-D `offsets` integer array attribute, a k-D `sizes` + integer array attribute, a k-D `strides` integer array attribute and + extracts the n-D subvector at the proper offset. + + At the moment strides must contain only 1s. + // TODO(ntv) support non-1 strides. + + Returns an n-D vector where the first k-D dimensions match the `sizes` + attribute. The returned subvector contains the elements starting at offset + `offsets` and ending at `offsets + sizes`. 
+ + Examples: + ``` + %1 = vector.strided_slice %0 + {offsets : [0, 2], sizes : [2, 4], strides : [1, 1]}: + vector<4x8x16xf32> to vector<2x4x16xf32> + ``` + + // TODO(ntv) Evolve to a range form syntax similar to: + %1 = vector.strided_slice %0[0:2:1][2:4:1] + vector<4x8x16xf32> to vector<2x4x16xf32> + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, " # + "ArrayRef offsets, ArrayRef sizes, " # + "ArrayRef strides">]; + let extraClassDeclaration = [{ + static StringRef getOffsetsAttrName() { return "offsets"; } + static StringRef getSizesAttrName() { return "sizes"; } + static StringRef getStridesAttrName() { return "strides"; } + VectorType getVectorType(){ return vector()->getType().cast(); } + void getOffsets(SmallVectorImpl &results); + }]; + let hasCanonicalizer = 1; +} + +def Vector_TransferReadOp : + Vector_Op<"transfer_read">, + Arguments<(ins AnyMemRef:$memref, Variadic:$indices, + AffineMapAttr:$permutation_map, AnyType:$padding)>, + Results<(outs AnyVector:$vector)> { + + let summary = "Reads a supervector from memory into an SSA vector value."; + + let description = [{ + The `vector.transfer_read` op performs a blocking read from a slice within + a [MemRef](../LangRef.md#memref-type) supplied as its first operand + into a [vector](../LangRef.md#vector-type) of the same base elemental type. + + A vector memref operand must have its vector element type match a suffix + (shape and element type) of the vector (e.g. memref<3x2x6x4x3xf32>, + vector<1x1x4x3xf32>). + + The slice is further defined by a full-rank index within the MemRef, + supplied as the operands `2 .. 1 + rank(memref)`. The permutation_map + [attribute](../LangRef.md#attributes) is an + [affine-map](Affine.md#affine-maps) which specifies the transposition on the + slice to match the vector shape. The size of the slice is specified by the + size of the vector, given as the return type. 
An `ssa-value` of the same + elemental type as the MemRef is provided as the last operand to specify + padding in the case of out-of-bounds accesses. This operation is called + 'read' by opposition to 'load' because the super-vector granularity is + generally not representable with a single hardware register. + A `vector.transfer_read` is thus a mid-level + abstraction that supports super-vectorization with non-effecting padding for + full-tile-only code. + + More precisely, let's dive deeper into the permutation_map for the following + MLIR: + + ```mlir + vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4] + { permutation_map : (d0,d1,d2,d3) -> (d2,0,d0) } : + memref, vector<3x4x5xf32> + ``` + + This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3, + %expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice + is: `%A[%expr1 : %expr1 + 5, %expr2, %expr3:%expr3 + 3, %expr4]` + + That slice needs to be read into a `vector<3x4x5xf32>`. Since the + permutation map is not full rank, there must be a broadcast along vector + dimension `1`. + + A notional lowering of vector.transfer_read could generate code resembling: + + ```mlir + // %expr1, %expr2, %expr3, %expr4 defined before this point + %tmp = alloc() : vector<3x4x5xf32> + %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> + for %i = 0 to 3 { + affine.for %j = 0 to 4 { + affine.for %k = 0 to 5 { + %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : + memref + store %tmp[%i, %j, %k] : vector<3x4x5xf32> + }}} + %c0 = constant 0 : index + %vec = load %view_in_tmp[%c0] : vector<3x4x5xf32> + ``` + + On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice that + the temporary storage footprint is `3 * 5` values but `3 * 4 * 5` values are + actually transferred between `%A` and `%tmp`. 
+ + Alternatively, if a notional vector broadcast operation were available, the + lowered code would resemble: + + ```mlir + // %expr1, %expr2, %expr3, %expr4 defined before this point + %tmp = alloc() : vector<3x4x5xf32> + %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> + for %i = 0 to 3 { + affine.for %k = 0 to 5 { + %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : + memref + store %tmp[%i, 0, %k] : vector<3x4x5xf32> + }} + %c0 = constant 0 : index + %tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32> + %vec = broadcast %tmpvec, 1 : vector<3x4x5xf32> + ``` + + where `broadcast` broadcasts from element 0 to all others along the + specified dimension. This time, the temporary storage footprint is `3 * 5` + values which is the same amount of data as the `3 * 5` values transferred. + An additional `1` broadcast is required. On a GPU this broadcast could be + implemented using a warp-shuffle if loop `j` were mapped to `threadIdx.x`. + + Syntax + ``` + operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list + `{` attribute-entry `} :` memref-type `,` vector-type + ``` + + Examples: + + ```mlir + // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32> + // and pad with %f0 to handle the boundary case: + %f0 = constant 0.0f : f32 + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 step 256 { + affine.for %i2 = 0 to %2 step 32 { + %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0) + {permutation_map: (d0, d1, d2) -> (d2, d1)} : + memref, vector<32x256xf32> + }}} + + // Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into + // vector<128xf32>. The underlying implementation will require a 1-D vector + // broadcast: + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 { + %3 = vector.transfer_read %A[%i0, %i1] + {permutation_map: (d0, d1) -> (0)} : + memref, vector<128xf32> + } + } + + // Read from a memref with vector element type. 
+ %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 + {permutation_map = (d0, d1)->(d0, d1)} + : memref>, vector<1x1x4x3xf32> + ``` + }]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref()->getType().cast(); + } + VectorType getVectorType() { + return vector()->getType().cast(); + } + }]; +} + +def Vector_TransferWriteOp : + Vector_Op<"transfer_write">, + Arguments<(ins AnyVector:$vector, AnyMemRef:$memref, + Variadic:$indices, + AffineMapAttr:$permutation_map)> { + + let summary = "The vector.transfer_write op writes a supervector to memory."; + + let description = [{ + The `vector.transfer_write` performs a blocking write from a + [vector](../LangRef.md#vector-type), supplied as its first operand, into a + slice within a [MemRef](../LangRef.md#memref-type) of the same base + elemental type, supplied as its second operand. + + A vector memref operand must have its vector element type match a suffix + (shape and element type) of the vector (e.g. memref<3x2x6x4x3xf32>, + vector<1x1x4x3xf32>). + + The slice is further defined by a full-rank index within the MemRef, + supplied as the operands `3 .. 2 + rank(memref)`. + The permutation_map [attribute](../LangRef.md#attributes) is an + [affine-map](Affine.md#affine-maps) which specifies the transposition on the + slice to match the vector shape. The size of the slice is specified by the + size of the vector. This operation is called 'write' by opposition to + 'store' because the super-vector granularity is generally not representable + with a single hardware register. A `vector.transfer_write` is thus a + mid-level abstraction that supports super-vectorization with non-effecting + padding for full-tile-only code. It is the responsibility of + `vector.transfer_write`'s implementation to ensure the memory writes are + valid. Different lowerings may be pertinent depending on the hardware + support. 
+ + Syntax: + + ``` + operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry `} : + ` vector-type ', ' memref-type ' + ``` + + Examples: + + ```mlir + // write vector<16x32x64xf32> into the slice + // `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`: + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 step 32 { + affine.for %i2 = 0 to %2 step 64 { + affine.for %i3 = 0 to %3 step 16 { + %val = `ssa-value` : vector<16x32x64xf32> + vector.transfer_write %val, %A[%i0, %i1, %i2, %i3] + {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : + vector<16x32x64xf32>, memref + }}}} + + // write to a memref with vector element type. + vector.transfer_write %4, %arg1[%c3, %c3] + {permutation_map = (d0, d1)->(d0, d1)} + : vector<1x1x4x3xf32>, memref> + ``` + }]; + + let extraClassDeclaration = [{ + VectorType getVectorType() { + return vector()->getType().cast(); + } + MemRefType getMemRefType() { + return memref()->getType().cast(); + } + }]; +} + +def Vector_TypeCastOp : + Vector_Op<"type_cast", [NoSideEffect]>, + Arguments<(ins StaticShapeMemRefOf<[AnyType]>:$memref)>, + Results<(outs AnyMemRef)> { + let summary = "type_cast op converts a scalar memref to a vector memref"; + let description = [{ + Performs a conversion from a memref with scalar element to a memref with a + *single* vector element, copying the shape of the memref to the vector. This + is the minimal viable operation that is required to make + super-vectorization operational. It can be seen as a special case of the + `view` operation but scoped in the super-vectorization context. 
+ + Syntax: + + ``` + operation ::= `vector.type_cast` ssa-use : memref-type to memref-type + ``` + + Example: + + ```mlir + %A = alloc() : memref<5x4x3xf32> + %VA = vector.type_cast %A : memref<5x4x3xf32> to memref> + ``` + }]; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source">]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref()->getType().cast(); + } + MemRefType getResultMemRefType() { + return getResult()->getType().cast(); + } + }]; +} + +def Vector_ConstantMaskOp : + Vector_Op<"constant_mask", [NoSideEffect]>, + Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>, + Results<(outs VectorOf<[I1]>)> { + let summary = "creates a constant vector mask"; + let description = [{ + Creates and returns a vector mask where elements of the result vector + are set to '0' or '1', based on whether the element indices are contained + within a hyper-rectangular region specified by the 'mask_dim_sizes' + array attribute argument. Each element of the 'mask_dim_sizes' array, + specifies an exclusive upper bound [0, mask-dim-size-element-value) + for a unique dimension in the vector result. The conjunction of the ranges + define a hyper-rectangular region within which elements values are set to 1 + (otherwise element values are set to 0). + + Example: create a constant vector mask of size 4x3xi1 with elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). 
+ + %1 = vector.constant_mask [3, 2] : vector<4x3xi1> + + print %1 + columns + 0 1 2 + |------------ + 0 | 1 1 0 + rows 1 | 1 1 0 + 2 | 1 1 0 + 3 | 0 0 0 + }]; + + let extraClassDeclaration = [{ + static StringRef getMaskDimSizesAttrName() { return "mask_dim_sizes"; } + }]; +} + +def Vector_CreateMaskOp : + Vector_Op<"create_mask", [NoSideEffect]>, + Arguments<(ins Variadic:$operands)>, Results<(outs VectorOf<[I1]>)> { + let summary = "creates a vector mask"; + let description = [{ + Creates and returns a vector mask where elements of the result vector + are set to '0' or '1', based on whether the element indices are contained + within a hyper-rectangular region specified by the operands. Specifically, + each operand specifies a range [0, operand-value) for a unique dimension in + the vector result. The conjunction of the operand ranges define a + hyper-rectangular region within which elements values are set to 1 + (otherwise element values are set to 0). + + Example: create a vector mask of size 4x3xi1 where elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + + %1 = vector.create_mask %c3, %c2 : vector<4x3xi1> + + print %1 + columns + 0 1 2 + |------------ + 0 | 1 1 0 + rows 1 | 1 1 0 + 2 | 1 1 0 + 3 | 0 0 0 + }]; + + let hasCanonicalizer = 1; +} + +def Vector_TupleOp : + Vector_Op<"tuple", [NoSideEffect]>, + Arguments<(ins Variadic:$vectors)>, + Results<(outs TupleOf<[AnyVector]>)> { + let summary = "make tuple of vectors operation"; + let description = [{ + Returns a tuple of its operands 'vectors'. + + Note that this operation is used during the vector op unrolling + transformation and should be removed before lowering to lower-level + dialects. + + + Examples: + ``` + %0 = vector.transfer_read ... : vector<2x2xf32> + %1 = vector.transfer_read ... : vector<2x1xf32> + %2 = vector.transfer_read ... : vector<2x2xf32> + %3 = vector.transfer_read ... 
: vector<2x1xf32> + + %4 = vector.tuple %0, %1, %2, %3 + : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32> + + ``` + }]; + + let extraClassDeclaration = [{ + TupleType getResultTupleType() { + return getResult()->getType().cast(); + } + }]; +} + +def Vector_TupleGetOp : + Vector_Op<"tuple_get", [NoSideEffect]>, + Arguments<(ins TupleOf<[AnyVector]>:$vectors, APIntAttr:$index)>, + Results<(outs AnyVector)> { + let summary = "vector tuple get operation"; + let description = [{ + Returns the tuple element of 'vectors' at 'index'. + + Note that this operation is used during the vector op unrolling + transformation and should be removed before lowering to lower-level + dialects. + + Examples: + ``` + %4 = vector.tuple %0, %1, %2, %3 + : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>> + + %5 = vector.tuple_get %4, 1 + : tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> + ``` + }]; + + let extraClassDeclaration = [{ + VectorType getResultVectorType() { + return getResult()->getType().cast(); + } + int64_t getIndex() { + return getAttrOfType("index").getValue().getSExtValue(); + } + static StringRef getIndexAttrName() { return "index"; } + }]; +} + +def Vector_PrintOp : + Vector_Op<"print", []>, Arguments<(ins AnyType:$source)> { + let summary = "print operation (for testing and debugging)"; + let description = [{ + Prints the source vector (or scalar) to stdout in human readable + format (for testing and debugging). No return value. + + Examples: + ``` + %0 = constant 0.0 : f32 + %1 = vector.broadcast %0 : f32 to vector<4xf32> + vector.print %1 : vector<4xf32> + + when lowered to LLVM, the vector print is unrolled into + elementary printing method calls that at runtime will yield + + ( 0.0, 0.0, 0.0, 0.0 ) + + on stdout when linked with a small runtime support library, + which only needs to provide a few printing methods (single + value for all data types, opening/closing bracket, comma, + newline). 
+ ``` + }]; + let verifier = ?; + let extraClassDeclaration = [{ + Type getPrintType() { + return source()->getType(); + } + }]; +} + +#endif // VECTOR_OPS diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td b/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td new file mode 100644 index 0000000000000000000000000000000000000000..5d0244f6989537c20e9d0561457d078d7f383e89 --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td @@ -0,0 +1,26 @@ +//===- VectorTransformPatterns.td - Vector-Vector patterns -*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the pattern definition file for declarative Vector transformations. +// +//===----------------------------------------------------------------------===// + +#ifndef VECTOR_TRANSFORM_PATTERNS +#define VECTOR_TRANSFORM_PATTERNS + +include "mlir/IR/OpBase.td" + +class HasShape shape> : + CPred<"$0->getType().cast().hasStaticShape({" # + StrJoinInt.result # "})">; + +class UnrollVectorOp factors> : NativeCodeCall< + "unrollSingleResultOpMatchingType($_builder, $0->getDefiningOp(), " # + "{" # StrJoinInt.result # "})">; + +#endif // VECTOR_TRANSFORM_PATTERNS diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorTransforms.h b/mlir/include/mlir/Dialect/VectorOps/VectorTransforms.h new file mode 100644 index 0000000000000000000000000000000000000000..feb8bd60445ba921815a9dc374a3f7e2c25246c6 --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/VectorTransforms.h @@ -0,0 +1,73 @@ +//===- VectorTransforms.h - Vector transformations as patterns --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef DIALECT_VECTOROPS_VECTORTRANSFORMS_H_ +#define DIALECT_VECTOROPS_VECTORTRANSFORMS_H_ + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class OwningRewritePatternList; + +/// Collect a set of patterns to convert from the Vector dialect to itself. +/// Should be merged with populateVectorToAffineLoopsConversionPatterns. +void populateVectorToVectorConversionPatterns( + MLIRContext *context, OwningRewritePatternList &patterns, + ArrayRef coarseVectorShape = {}, + ArrayRef fineVectorShape = {}); + +//////////////////////////////////////////////////////////////////////////////// +// The following Declarative Rewrite Rule (DRR) helpers are used in rewrite +// patterns. As such, they must not call into `rewriter.erase/replace` APIs and +// it is the responsibility of the enclosing PatternRewriter to erase on +// success. +//////////////////////////////////////////////////////////////////////////////// + +namespace vector { + +// Entry point for unrolling declarative pattern rewrites. +// `op` is unrolled to the `targetShape` as follows, for each of its operands: +// 1. the unrolled type `unrolledVectorType` and number of unrolled instances +// `numUnrolledInstances` are computed from the `targetShape`. For now it is +// assumed the unrolling factors divide the vector sizes. +// 2. a fakeFork cast op is inserted that takes the operand and returns +// `numUnrolledInstances` results of type `unrolledVectorType`. +// 3. the original op is cloned `numUnrolledInstances` times, once for each +// result of the fakeFork cast op. +// 4. a fakeJoin cast op takes all these results and merges them into a single +// aggregate vector result whose size matches the original non-unrolled op +// operand types. 
+// +// Example: +// +// opA(operand0, operand1) // numUnrolledInstances = 3 +// +// operand0 operand1 +// | | +// fork fork +// <----------gather all fork ops ---------> +// /|\ /|\ +// f00 f01 f02 f10 f11 f12 +// <---------- clone op 3 times ---------> +// opA0(f00, f10), opA1(f01, f11), opA2(f02, f12) +// \ | / +// <-------------------- join -------------------------> +// +// Other local patterns then kick in iteratively (including DCE) and compose +// until all the fakeFork and fakeJoin ops are removed. +// +// This will be extended in the future to support more advanced use cases than +// simple pointwise ops. +Value unrollSingleResultOpMatchingType(PatternRewriter &builder, Operation *op, + ArrayRef targetShape); + +} // namespace vector +} // namespace mlir + +#endif // DIALECT_VECTOROPS_VECTORTRANSFORMS_H_ diff --git a/mlir/include/mlir/EDSC/Builders.h b/mlir/include/mlir/EDSC/Builders.h new file mode 100644 index 0000000000000000000000000000000000000000..d598c1cfb23ba2e8cc204182f3a8b7654a5e780e --- /dev/null +++ b/mlir/include/mlir/EDSC/Builders.h @@ -0,0 +1,538 @@ +//===- Builders.h - MLIR Declarative Builder Classes ------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides intuitive composable interfaces for building structured MLIR +// snippets in a declarative fashion. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EDSC_BUILDERS_H_ +#define MLIR_EDSC_BUILDERS_H_ + +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/IR/Builders.h" +#include "mlir/Transforms/FoldUtils.h" + +namespace mlir { + +namespace edsc { + +struct index_t { + explicit index_t(int64_t v) : v(v) {} + explicit operator int64_t() { return v; } + int64_t v; +}; + +class BlockHandle; +class CapturableHandle; +class NestedBuilder; +class ValueHandle; + +/// Helper class to transparently handle builder insertion points by RAII. +/// As its name indicates, a ScopedContext is means to be used locally in a +/// scoped fashion. This abstracts away all the boilerplate related to +/// checking proper usage of captures, NestedBuilders as well as handling the +/// setting and restoring of insertion points. +class ScopedContext { +public: + ScopedContext(OpBuilder &builder, Location location); + + /// Sets the insertion point of the builder to 'newInsertPt' for the duration + /// of the scope. The existing insertion point of the builder is restored on + /// destruction. + ScopedContext(OpBuilder &builder, OpBuilder::InsertPoint newInsertPt, + Location location); + ~ScopedContext(); + + static MLIRContext *getContext(); + static OpBuilder &getBuilder(); + static Location getLocation(); + +private: + /// Only NestedBuilder (which is used to create an operation with a body) + /// may access private members in order to implement scoping. + friend class NestedBuilder; + + ScopedContext() = delete; + ScopedContext(const ScopedContext &) = delete; + ScopedContext &operator=(const ScopedContext &) = delete; + + static ScopedContext *&getCurrentScopedContext(); + + /// Top level OpBuilder. + OpBuilder &builder; + /// The previous insertion point of the builder. + Optional prevBuilderInsertPoint; + /// Current location. 
+ Location location; + /// Parent context we return into. + ScopedContext *enclosingScopedContext; + /// Defensively keeps track of the current NestedBuilder to ensure proper + /// scoping usage. + NestedBuilder *nestedBuilder; + + // TODO: Implement scoping of ValueHandles. To do this we need a proper data + // structure to hold ValueHandle objects. We can emulate one but there should + // already be something available in LLVM for this purpose. +}; + +/// A NestedBuilder is a scoping abstraction to create an idiomatic syntax +/// embedded in C++ that serves the purpose of building nested MLIR. +/// Nesting and compositionality is obtained by using the strict ordering that +/// exists between object construction and method invocation on said object (in +/// our case, the call to `operator()`). +/// This ordering allows implementing an abstraction that decouples definition +/// from declaration (in a PL sense) on placeholders of type ValueHandle and +/// BlockHandle. +class NestedBuilder { +protected: + NestedBuilder() = default; + NestedBuilder(const NestedBuilder &) = delete; + NestedBuilder(NestedBuilder &&other) : bodyScope(other.bodyScope) { + other.bodyScope = nullptr; + } + + NestedBuilder &operator=(const NestedBuilder &) = delete; + NestedBuilder &operator=(NestedBuilder &&other) { + std::swap(bodyScope, other.bodyScope); + return *this; + } + + /// Enter an mlir::Block and setup a ScopedContext to insert operations at + /// the end of it. Since we cannot use c++ language-level scoping to implement + /// scoping itself, we use enter/exit pairs of operations. + /// As a consequence we must allocate a new OpBuilder + ScopedContext and + /// let them escape. + /// Step back "prev" times from the end of the block to set up the insertion + /// point, which is useful for non-empty blocks. 
+ void enter(mlir::Block *block, int prev = 0) { + bodyScope = new ScopedContext( + ScopedContext::getBuilder(), + OpBuilder::InsertPoint(block, std::prev(block->end(), prev)), + ScopedContext::getLocation()); + bodyScope->nestedBuilder = this; + } + + /// Exit the current mlir::Block by explicitly deleting the dynamically + /// allocated OpBuilder and ScopedContext. + void exit() { + // Reclaim now to exit the scope. + bodyScope->nestedBuilder = nullptr; + delete bodyScope; + bodyScope = nullptr; + } + + /// Custom destructor does nothing because we already destroyed bodyScope + /// manually in `exit`. Insert an assertion to defensively guard against + /// improper usage of scoping. + ~NestedBuilder() { + assert(!bodyScope && + "Illegal use of NestedBuilder; must have called exit()"); + } + +private: + ScopedContext *bodyScope = nullptr; +}; + +/// A LoopBuilder is a generic NestedBuilder for loop-like MLIR operations. +/// More specifically it is meant to be used as a temporary object for +/// representing any nested MLIR construct that is "related to" an mlir::Value +/// (for now an induction variable). +/// This is extensible and will evolve in the future as MLIR evolves, hence +/// the name LoopBuilder (as opposed to say ForBuilder or AffineForBuilder). +class LoopBuilder : public NestedBuilder { +public: + /// Constructs a new AffineForOp and captures the associated induction + /// variable. A ValueHandle pointer is passed as the first argument and is the + /// *only* way to capture the loop induction variable. + static LoopBuilder makeAffine(ValueHandle *iv, + ArrayRef lbHandles, + ArrayRef ubHandles, int64_t step); + /// Constructs a new loop::ForOp and captures the associated induction + /// variable. A ValueHandle pointer is passed as the first argument and is the + /// *only* way to capture the loop induction variable. 
+ static LoopBuilder makeLoop(ValueHandle *iv, ValueHandle lbHandle, + ValueHandle ubHandle, ValueHandle stepHandle); + LoopBuilder(const LoopBuilder &) = delete; + LoopBuilder(LoopBuilder &&) = default; + + LoopBuilder &operator=(const LoopBuilder &) = delete; + LoopBuilder &operator=(LoopBuilder &&) = default; + + /// The only purpose of this operator is to serve as a sequence point so that + /// the evaluation of `fun` (which build IR snippets in a scoped fashion) is + /// scoped within a LoopBuilder. + void operator()(function_ref fun = nullptr); + +private: + LoopBuilder() = default; +}; + +/// Explicit nested LoopBuilder. Offers a compressed multi-loop builder to avoid +/// explicitly writing all the loops in a nest. This simple functionality is +/// also useful to write rank-agnostic custom ops. +/// +/// Usage: +/// +/// ```c++ +/// AffineLoopNestBuilder({&i, &j, &k}, {lb, lb, lb}, {ub, ub, ub}, {1, 1, +/// 1})( +/// [&](){ +/// ... +/// }); +/// ``` +/// +/// ```c++ +/// AffineLoopNestBuilder({&i}, {lb}, {ub}, {1})([&](){ +/// AffineLoopNestBuilder({&j}, {lb}, {ub}, {1})([&](){ +/// AffineLoopNestBuilder({&k}, {lb}, {ub}, {1})([&](){ +/// ... +/// }), +/// }), +/// }); +/// ``` +class AffineLoopNestBuilder { +public: + // This entry point accommodates the fact that AffineForOp implicitly uses + // multiple `lbs` and `ubs` with one single `iv` and `step` to encode `max` + // and `min` constraints respectively. + AffineLoopNestBuilder(ValueHandle *iv, ArrayRef lbs, + ArrayRef ubs, int64_t step); + AffineLoopNestBuilder(ArrayRef ivs, ArrayRef lbs, + ArrayRef ubs, ArrayRef steps); + + void operator()(function_ref fun = nullptr); + +private: + SmallVector loops; +}; + +/// Helper class to sugar building loop.for loop nests from ranges. +/// This is similar to edsc::AffineLoopNestBuilder except it operates on +/// loop.for. 
+class LoopNestBuilder { +public: + LoopNestBuilder(ArrayRef ivs, ArrayRef lbs, + ArrayRef ubs, ArrayRef steps); + void operator()(std::function fun = nullptr); + +private: + SmallVector loops; +}; + +// This class exists solely to handle the C++ vexing parse case when +// trying to enter a Block that has already been constructed. +class Append {}; + +/// A BlockBuilder is a NestedBuilder for mlir::Block*. +/// This exists by opposition to LoopBuilder which is not related to an +/// mlir::Block* but to a mlir::Value. +/// It is meant to be used as a temporary object for representing any nested +/// MLIR construct that is "related to" an mlir::Block*. +class BlockBuilder : public NestedBuilder { +public: + /// Enters the mlir::Block* previously captured by `bh` and sets the insertion + /// point to its end. + BlockBuilder(BlockHandle bh, Append); + + /// Constructs a new mlir::Block with argument types derived from `args`. + /// Captures the new block in `bh` and its arguments into `args`. + /// Enters the new mlir::Block* and sets the insertion point to its end. + /// + /// Prerequisites: + /// The ValueHandle `args` are typed delayed ValueHandles; i.e. they are + /// not yet bound to mlir::Value. + BlockBuilder(BlockHandle *bh, ArrayRef args); + + /// The only purpose of this operator is to serve as a sequence point so that + /// the evaluation of `fun` (which build IR snippets in a scoped fashion) is + /// scoped within a BlockBuilder. + void operator()(function_ref fun = nullptr); + +private: + BlockBuilder(BlockBuilder &) = delete; + BlockBuilder &operator=(BlockBuilder &other) = delete; +}; + +/// Base class for ValueHandle, OperationHandle and BlockHandle. +/// Not meant to be used outside of these classes. +class CapturableHandle { +protected: + CapturableHandle() = default; +}; + +/// ValueHandle implements a (potentially "delayed") typed Value abstraction. +/// ValueHandle should be captured by pointer but otherwise passed by Value +/// everywhere. 
+/// A ValueHandle can have 3 states: +/// 1. null state (empty type and empty value), in which case it does not hold +/// a value and must never hold a Value (now or in the future). This is +/// used for MLIR operations with zero returns as well as the result of +/// calling a NestedBuilder::operator(). In both cases the objective is to +/// have an object that can be inserted in an ArrayRef to +/// implement nesting; +/// 2. delayed state (empty value), in which case it represents an eagerly +/// typed "delayed" value that can hold a Value in the future; +/// 3. constructed state, in which case it holds a Value. +/// +/// A ValueHandle is meant to capture a single Value and should be used for +/// operations that have a single result. For convenience of use, we also +/// include AffineForOp in this category although it does not return a value. +/// In the case of AffineForOp, the captured Value is the loop induction +/// variable. +class ValueHandle : public CapturableHandle { +public: + /// A ValueHandle in a null state can never be captured; + static ValueHandle null() { return ValueHandle(); } + + /// A ValueHandle that is constructed from a Type represents a typed "delayed" + /// Value. A delayed Value can only capture Values of the specified type. + /// Such a delayed value represents the declaration (in the PL sense) of a + /// placeholder for an mlir::Value that will be constructed and captured at + /// some later point in the program. + explicit ValueHandle(Type t) : t(t), v(nullptr) {} + + /// A ValueHandle that is constructed from an mlir::Value is an "eager" + /// Value. An eager Value represents both the declaration and the definition + /// (in the PL sense) of a placeholder for an mlir::Value that has already + /// been constructed in the past and that is captured "now" in the program. + explicit ValueHandle(Value v) : t(v->getType()), v(v) {} + + /// Builds a ConstantIndexOp of value `cst`. 
The constant is created at the + /// current insertion point. + /// This implicit constructor is provided to easily build an eager Value for a + /// constant at the current insertion point in the IR. An implicit constructor + /// allows idiomatic expressions mixing ValueHandle and literals. + ValueHandle(index_t cst); + + /// ValueHandle is a value type, use the default copy constructor. + ValueHandle(const ValueHandle &other) = default; + + /// ValueHandle is a value type, the assignment operator typechecks before + /// assigning. + ValueHandle &operator=(const ValueHandle &other); + + /// Provide a swap operator. + void swap(ValueHandle &other) { + if (this == &other) + return; + std::swap(t, other.t); + std::swap(v, other.v); + } + + /// Implicit conversion useful for automatic conversion to Container. + operator Value() const { return getValue(); } + operator bool() const { return hasValue(); } + + /// Generic mlir::Op create. This is the key to being extensible to the whole + /// of MLIR without duplicating the type system or the op definitions. + template + static ValueHandle create(Args... args); + + /// Generic mlir::Op create. This is the key to being extensible to the whole + /// of MLIR without duplicating the type system or the op definitions. + /// When non-null, the optional pointer `folder` is used to call into the + /// `createAndFold` builder method. If `folder` is null, the regular `create` + /// method is called. + template + static ValueHandle create(OperationFolder *folder, Args... args); + + /// Special case to build composed AffineApply operations. + // TODO: createOrFold when available and move inside of the `create` method. + static ValueHandle createComposedAffineApply(AffineMap map, + ArrayRef operands); + + /// Generic create for a named operation producing a single value. 
+ static ValueHandle create(StringRef name, ArrayRef operands, + ArrayRef resultTypes, + ArrayRef attributes = {}); + + bool hasValue() const { return v != nullptr; } + Value getValue() const { + assert(hasValue() && "Unexpected null value;"); + return v; + } + bool hasType() const { return t != Type(); } + Type getType() const { return t; } + + Operation *getOperation() const { + if (!v) + return nullptr; + return v->getDefiningOp(); + } + +protected: + ValueHandle() : t(), v(nullptr) {} + + Type t; + Value v; +}; + +/// An OperationHandle can be used in lieu of ValueHandle to capture the +/// operation in cases when one does not care about, or cannot extract, a +/// unique Value from the operation. +/// This can be used for capturing zero result operations as well as +/// multi-result operations that are not supported by ValueHandle. +/// We do not distinguish further between zero and multi-result operations at +/// this time. +struct OperationHandle : public CapturableHandle { + OperationHandle() : op(nullptr) {} + OperationHandle(Operation *op) : op(op) {} + + OperationHandle(const OperationHandle &) = default; + OperationHandle &operator=(const OperationHandle &) = default; + + /// Generic mlir::Op create. This is the key to being extensible to the whole + /// of MLIR without duplicating the type system or the op definitions. + template + static OperationHandle create(Args... args); + template static Op createOp(Args... args); + + /// Generic create for a named operation. + static OperationHandle create(StringRef name, ArrayRef operands, + ArrayRef resultTypes, + ArrayRef attributes = {}); + + operator Operation *() { return op; } + Operation *getOperation() const { return op; } + +private: + Operation *op; +}; + +/// Simple wrapper to build a generic operation without successor blocks. 
+template struct CustomOperation { + CustomOperation(StringRef name) : name(name) { + static_assert(std::is_same() || + std::is_same(), + "Only CustomOperation or " + "CustomOperation can be constructed."); + } + HandleType operator()(ArrayRef operands = {}, + ArrayRef resultTypes = {}, + ArrayRef attributes = {}) { + return HandleType::create(name, operands, resultTypes, attributes); + } + std::string name; +}; + +/// A BlockHandle represents a (potentially "delayed") Block abstraction. +/// This extra abstraction is necessary because an mlir::Block is not an +/// mlir::Value. +/// A BlockHandle should be captured by pointer but otherwise passed by Value +/// everywhere. +class BlockHandle : public CapturableHandle { +public: + /// A BlockHandle constructed without an mlir::Block* represents a "delayed" + /// Block. A delayed Block represents the declaration (in the PL sense) of a + /// placeholder for an mlir::Block* that will be constructed and captured at + /// some later point in the program. + BlockHandle() : block(nullptr) {} + + /// A BlockHandle constructed with an mlir::Block* represents an "eager" + /// Block. An eager Block represents both the declaration and the definition + /// (in the PL sense) of a placeholder for an mlir::Block* that has already + /// been constructed in the past and that is captured "now" in the program. + BlockHandle(mlir::Block *block) : block(block) {} + + /// BlockHandle is a value type, use the default copy constructor and + /// assignment operator. + BlockHandle(const BlockHandle &) = default; + BlockHandle &operator=(const BlockHandle &) = default; + + /// Delegates block creation to MLIR and wrap the resulting mlir::Block. + static BlockHandle create(ArrayRef argTypes); + + operator bool() { return block != nullptr; } + operator mlir::Block *() { return block; } + mlir::Block *getBlock() { return block; } + +private: + mlir::Block *block; +}; + +template +OperationHandle OperationHandle::create(Args... 
args) { + return OperationHandle(ScopedContext::getBuilder() + .create(ScopedContext::getLocation(), args...) + .getOperation()); +} + +template +Op OperationHandle::createOp(Args... args) { + return cast( + OperationHandle(ScopedContext::getBuilder() + .create(ScopedContext::getLocation(), args...) + .getOperation()) + .getOperation()); +} + +template +ValueHandle ValueHandle::create(Args... args) { + Operation *op = ScopedContext::getBuilder() + .create(ScopedContext::getLocation(), args...) + .getOperation(); + if (op->getNumResults() == 1) { + return ValueHandle(op->getResult(0)); + } else if (op->getNumResults() == 0) { + if (auto f = dyn_cast(op)) { + return ValueHandle(f.getInductionVar()); + } + } + llvm_unreachable("unsupported operation, use an OperationHandle instead"); +} + +template +ValueHandle ValueHandle::create(OperationFolder *folder, Args... args) { + return folder ? ValueHandle(folder->create(ScopedContext::getBuilder(), + ScopedContext::getLocation(), + args...)) + : ValueHandle(ScopedContext::getBuilder().create( + ScopedContext::getLocation(), args...)); +} + +namespace op { + +ValueHandle operator+(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator-(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator*(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator/(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator%(ValueHandle lhs, ValueHandle rhs); +ValueHandle floorDiv(ValueHandle lhs, ValueHandle rhs); +ValueHandle ceilDiv(ValueHandle lhs, ValueHandle rhs); + +ValueHandle operator!(ValueHandle value); +ValueHandle operator&&(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator||(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator^(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator==(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator!=(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator<(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator<=(ValueHandle lhs, ValueHandle rhs); +ValueHandle 
operator>(ValueHandle lhs, ValueHandle rhs); +ValueHandle operator>=(ValueHandle lhs, ValueHandle rhs); + +} // namespace op + +/// Entry point to build multiple ValueHandle from a `Container` of Value or +/// Type. +template +inline SmallVector makeValueHandles(Container values) { + SmallVector res; + res.reserve(values.size()); + for (auto v : values) + res.push_back(ValueHandle(v)); + return res; +} + +} // namespace edsc +} // namespace mlir + +#endif // MLIR_EDSC_BUILDERS_H_ diff --git a/mlir/include/mlir/EDSC/Helpers.h b/mlir/include/mlir/EDSC/Helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..a7c0365225a750ead4f3105cad017e9d0e910104 --- /dev/null +++ b/mlir/include/mlir/EDSC/Helpers.h @@ -0,0 +1,258 @@ +//===- Helpers.h - MLIR Declarative Helper Functionality --------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides helper classes and syntactic sugar for declarative builders. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EDSC_HELPERS_H_ +#define MLIR_EDSC_HELPERS_H_ + +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Intrinsics.h" + +namespace mlir { +namespace edsc { + +// A TemplatedIndexedValue brings an index notation over the template Load and +// Store parameters. +template class TemplatedIndexedValue; + +// By default, edsc::IndexedValue provides an index notation around the affine +// load and stores. edsc::StdIndexedValue provides the standard load/store +// counterpart. +using IndexedValue = + TemplatedIndexedValue; +using StdIndexedValue = + TemplatedIndexedValue; + +// Base class for MemRefView and VectorView. 
+class View { +public: + unsigned rank() const { return lbs.size(); } + ValueHandle lb(unsigned idx) { return lbs[idx]; } + ValueHandle ub(unsigned idx) { return ubs[idx]; } + int64_t step(unsigned idx) { return steps[idx]; } + std::tuple range(unsigned idx) { + return std::make_tuple(lbs[idx], ubs[idx], steps[idx]); + } + void swapRanges(unsigned i, unsigned j) { + if (i == j) + return; + lbs[i].swap(lbs[j]); + ubs[i].swap(ubs[j]); + std::swap(steps[i], steps[j]); + } + + ArrayRef getLbs() { return lbs; } + ArrayRef getUbs() { return ubs; } + ArrayRef getSteps() { return steps; } + +protected: + SmallVector lbs; + SmallVector ubs; + SmallVector steps; +}; + +/// A MemRefView represents the information required to step through a +/// MemRef. It has placeholders for non-contiguous tensors that fit within the +/// Fortran subarray model. +/// At the moment it can only capture a MemRef with an identity layout map. +// TODO(ntv): Support MemRefs with layoutMaps. +class MemRefView : public View { +public: + explicit MemRefView(Value v); + MemRefView(const MemRefView &) = default; + MemRefView &operator=(const MemRefView &) = default; + + unsigned fastestVarying() const { return rank() - 1; } + +private: + friend IndexedValue; + ValueHandle base; +}; + +/// A VectorView represents the information required to step through a +/// Vector accessing each scalar element at a time. It is the counterpart of +/// a MemRefView but for vectors. This exists purely for boilerplate avoidance. +class VectorView : public View { +public: + explicit VectorView(Value v); + VectorView(const VectorView &) = default; + VectorView &operator=(const VectorView &) = default; + +private: + friend IndexedValue; + ValueHandle base; +}; + +/// A TemplatedIndexedValue brings an index notation over the template Load and +/// Store parameters. 
This helper class is an abstraction purely for sugaring +/// purposes and allows writing compact expressions such as: +/// +/// ```mlir +/// // `IndexedValue` provided by default in the mlir::edsc namespace. +/// using IndexedValue = +/// TemplatedIndexedValue; +/// IndexedValue A(...), B(...), C(...); +/// For(ivs, zeros, shapeA, ones, { +/// C(ivs) = A(ivs) + B(ivs) +/// }); +/// ``` +/// +/// Assigning to an IndexedValue emits an actual `Store` operation, while +/// converting an IndexedValue to a ValueHandle emits an actual `Load` +/// operation. +template class TemplatedIndexedValue { +public: + explicit TemplatedIndexedValue(Type t) : base(t) {} + explicit TemplatedIndexedValue(Value v) + : TemplatedIndexedValue(ValueHandle(v)) {} + explicit TemplatedIndexedValue(ValueHandle v) : base(v) {} + + TemplatedIndexedValue(const TemplatedIndexedValue &rhs) = default; + + TemplatedIndexedValue operator()() { return *this; } + /// Returns a new `TemplatedIndexedValue`. + TemplatedIndexedValue operator()(ValueHandle index) { + TemplatedIndexedValue res(base); + res.indices.push_back(index); + return res; + } + template + TemplatedIndexedValue operator()(ValueHandle index, Args... indices) { + return TemplatedIndexedValue(base, index).append(indices...); + } + TemplatedIndexedValue operator()(ArrayRef indices) { + return TemplatedIndexedValue(base, indices); + } + TemplatedIndexedValue operator()(ArrayRef indices) { + return TemplatedIndexedValue( + base, ArrayRef(indices.begin(), indices.end())); + } + + /// Emits a `store`. + // NOLINTNEXTLINE: unconventional-assign-operator + OperationHandle operator=(const TemplatedIndexedValue &rhs) { + ValueHandle rrhs(rhs); + return Store(rrhs, getBase(), {indices.begin(), indices.end()}); + } + // NOLINTNEXTLINE: unconventional-assign-operator + OperationHandle operator=(ValueHandle rhs) { + return Store(rhs, getBase(), {indices.begin(), indices.end()}); + } + + /// Emits a `load` when converting to a ValueHandle. 
+ operator ValueHandle() const { + return Load(getBase(), {indices.begin(), indices.end()}); + } + + /// Emits a `load` when converting to a Value. + Value operator*(void) const { + return Load(getBase(), {indices.begin(), indices.end()}).getValue(); + } + + ValueHandle getBase() const { return base; } + + /// Operator overloadings. + ValueHandle operator+(ValueHandle e); + ValueHandle operator-(ValueHandle e); + ValueHandle operator*(ValueHandle e); + ValueHandle operator/(ValueHandle e); + OperationHandle operator+=(ValueHandle e); + OperationHandle operator-=(ValueHandle e); + OperationHandle operator*=(ValueHandle e); + OperationHandle operator/=(ValueHandle e); + ValueHandle operator+(TemplatedIndexedValue e) { + return *this + static_cast(e); + } + ValueHandle operator-(TemplatedIndexedValue e) { + return *this - static_cast(e); + } + ValueHandle operator*(TemplatedIndexedValue e) { + return *this * static_cast(e); + } + ValueHandle operator/(TemplatedIndexedValue e) { + return *this / static_cast(e); + } + OperationHandle operator+=(TemplatedIndexedValue e) { + return this->operator+=(static_cast(e)); + } + OperationHandle operator-=(TemplatedIndexedValue e) { + return this->operator-=(static_cast(e)); + } + OperationHandle operator*=(TemplatedIndexedValue e) { + return this->operator*=(static_cast(e)); + } + OperationHandle operator/=(TemplatedIndexedValue e) { + return this->operator/=(static_cast(e)); + } + +private: + TemplatedIndexedValue(ValueHandle base, ArrayRef indices) + : base(base), indices(indices.begin(), indices.end()) {} + + TemplatedIndexedValue &append() { return *this; } + + template + TemplatedIndexedValue &append(T index, Args... indices) { + this->indices.push_back(static_cast(index)); + append(indices...); + return *this; + } + ValueHandle base; + SmallVector indices; +}; + +/// Operator overloadings. 
+template +ValueHandle TemplatedIndexedValue::operator+(ValueHandle e) { + using op::operator+; + return static_cast(*this) + e; +} +template +ValueHandle TemplatedIndexedValue::operator-(ValueHandle e) { + using op::operator-; + return static_cast(*this) - e; +} +template +ValueHandle TemplatedIndexedValue::operator*(ValueHandle e) { + using op::operator*; + return static_cast(*this) * e; +} +template +ValueHandle TemplatedIndexedValue::operator/(ValueHandle e) { + using op::operator/; + return static_cast(*this) / e; +} + +template +OperationHandle TemplatedIndexedValue::operator+=(ValueHandle e) { + using op::operator+; + return Store(*this + e, getBase(), {indices.begin(), indices.end()}); +} +template +OperationHandle TemplatedIndexedValue::operator-=(ValueHandle e) { + using op::operator-; + return Store(*this - e, getBase(), {indices.begin(), indices.end()}); +} +template +OperationHandle TemplatedIndexedValue::operator*=(ValueHandle e) { + using op::operator*; + return Store(*this * e, getBase(), {indices.begin(), indices.end()}); +} +template +OperationHandle TemplatedIndexedValue::operator/=(ValueHandle e) { + using op::operator/; + return Store(*this / e, getBase(), {indices.begin(), indices.end()}); +} + +} // namespace edsc +} // namespace mlir + +#endif // MLIR_EDSC_HELPERS_H_ diff --git a/mlir/include/mlir/EDSC/Intrinsics.h b/mlir/include/mlir/EDSC/Intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..30cce6bb8d6152d93e0b3d016b65a3c626412df7 --- /dev/null +++ b/mlir/include/mlir/EDSC/Intrinsics.h @@ -0,0 +1,276 @@ +//===- Intrinsics.h - MLIR Operations for Declarative Builders ---*- C++-*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Provides intuitive composable intrinsics for building snippets of MLIR +// declaratively +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EDSC_INTRINSICS_H_ +#define MLIR_EDSC_INTRINSICS_H_ + +#include "mlir/EDSC/Builders.h" +#include "mlir/Support/LLVM.h" + +namespace mlir { + +class MemRefType; +class Type; + +namespace edsc { + +/// An IndexHandle is a simple wrapper around a ValueHandle. +/// IndexHandles are ubiquitous enough to justify a new type to allow simple +/// declarations without boilerplate such as: +/// +/// ```c++ +/// IndexHandle i, j, k; +/// ``` +struct IndexHandle : public ValueHandle { + explicit IndexHandle() + : ValueHandle(ScopedContext::getBuilder().getIndexType()) {} + explicit IndexHandle(index_t v) : ValueHandle(v) {} + explicit IndexHandle(Value v) : ValueHandle(v) { + assert(v->getType() == ScopedContext::getBuilder().getIndexType() && + "Expected index type"); + } + explicit IndexHandle(ValueHandle v) : ValueHandle(v) { + assert(v.getType() == ScopedContext::getBuilder().getIndexType() && + "Expected index type"); + } + IndexHandle &operator=(const ValueHandle &v) { + assert(v.getType() == ScopedContext::getBuilder().getIndexType() && + "Expected index type"); + /// Creating a new IndexHandle(v) and then std::swap rightly complains the + /// binding has already occurred and that we should use another name. + this->t = v.getType(); + this->v = v.getValue(); + return *this; + } +}; + +inline SmallVector makeIndexHandles(unsigned rank) { + return SmallVector(rank); +} + +/// Entry point to build multiple ValueHandle* from a mutable list `ivs` of T. 
+template +inline SmallVector +makeHandlePointers(MutableArrayRef ivs) { + SmallVector pivs; + pivs.reserve(ivs.size()); + for (auto &iv : ivs) { + pivs.push_back(&iv); + } + return pivs; +} + +/// Returns a vector of the underlying Value from `ivs`. +inline SmallVector extractValues(ArrayRef ivs) { + SmallVector vals; + vals.reserve(ivs.size()); + for (auto &iv : ivs) { + vals.push_back(iv.getValue()); + } + return vals; +} + +/// Provides a set of first class intrinsics. +/// In the future, most of intrinsics related to Operation that don't contain +/// other operations should be Tablegen'd. +namespace intrinsics { +namespace detail { +/// Helper structure to be used with ValueBuilder / OperationBuilder. +/// It serves the purpose of removing boilerplate specialization for the sole +/// purpose of implicitly converting ArrayRef -> ArrayRef. +class ValueHandleArray { +public: + ValueHandleArray(ArrayRef vals) { + values.append(vals.begin(), vals.end()); + } + ValueHandleArray(ArrayRef vals) { + values.append(vals.begin(), vals.end()); + } + ValueHandleArray(ArrayRef vals) { + SmallVector tmp(vals.begin(), vals.end()); + values.append(tmp.begin(), tmp.end()); + } + operator ArrayRef() { return values; } + +private: + ValueHandleArray() = default; + SmallVector values; +}; + +template inline T unpack(T value) { return value; } + +inline detail::ValueHandleArray unpack(ArrayRef values) { + return detail::ValueHandleArray(values); +} + +} // namespace detail + +/// Helper variadic abstraction to allow extending to any MLIR op without +/// boilerplate or Tablegen. +/// Arguably a builder is not a ValueHandle but in practice it is only used as +/// an alias to a notional ValueHandle. +/// Implementing it as a subclass allows it to compose all the way to Value. +/// Without subclassing, implicit conversion to Value would fail when composing +/// in patterns such as: `select(a, b, select(c, d, e))`. 
+template struct ValueBuilder : public ValueHandle { + // Builder-based + template + ValueBuilder(Args... args) + : ValueHandle(ValueHandle::create(detail::unpack(args)...)) {} + ValueBuilder(ArrayRef vs) + : ValueBuilder(ValueBuilder::create(detail::unpack(vs))) {} + template + ValueBuilder(ArrayRef vs, Args... args) + : ValueHandle(ValueHandle::create(detail::unpack(vs), + detail::unpack(args)...)) {} + template + ValueBuilder(T t, ArrayRef vs, Args... args) + : ValueHandle(ValueHandle::create( + detail::unpack(t), detail::unpack(vs), detail::unpack(args)...)) {} + template + ValueBuilder(T1 t1, T2 t2, ArrayRef vs, Args... args) + : ValueHandle(ValueHandle::create( + detail::unpack(t1), detail::unpack(t2), detail::unpack(vs), + detail::unpack(args)...)) {} + + /// Folder-based + template + ValueBuilder(OperationFolder *folder, Args... args) + : ValueHandle(ValueHandle::create(folder, detail::unpack(args)...)) {} + ValueBuilder(OperationFolder *folder, ArrayRef vs) + : ValueBuilder(ValueBuilder::create(folder, detail::unpack(vs))) {} + template + ValueBuilder(OperationFolder *folder, ArrayRef vs, Args... args) + : ValueHandle(ValueHandle::create(folder, detail::unpack(vs), + detail::unpack(args)...)) {} + template + ValueBuilder(OperationFolder *folder, T t, ArrayRef vs, + Args... args) + : ValueHandle(ValueHandle::create(folder, detail::unpack(t), + detail::unpack(vs), + detail::unpack(args)...)) {} + template + ValueBuilder(OperationFolder *folder, T1 t1, T2 t2, ArrayRef vs, + Args... args) + : ValueHandle(ValueHandle::create( + folder, detail::unpack(t1), detail::unpack(t2), detail::unpack(vs), + detail::unpack(args)...)) {} + + ValueBuilder() : ValueHandle(ValueHandle::create()) {} +}; + +template struct OperationBuilder : public OperationHandle { + template + OperationBuilder(Args... 
args) + : OperationHandle(OperationHandle::create(detail::unpack(args)...)) {} + OperationBuilder(ArrayRef vs) + : OperationHandle(OperationHandle::create(detail::unpack(vs))) {} + template + OperationBuilder(ArrayRef vs, Args... args) + : OperationHandle(OperationHandle::create(detail::unpack(vs), + detail::unpack(args)...)) {} + template + OperationBuilder(T t, ArrayRef vs, Args... args) + : OperationHandle(OperationHandle::create( + detail::unpack(t), detail::unpack(vs), detail::unpack(args)...)) {} + template + OperationBuilder(T1 t1, T2 t2, ArrayRef vs, Args... args) + : OperationHandle(OperationHandle::create( + detail::unpack(t1), detail::unpack(t2), detail::unpack(vs), + detail::unpack(args)...)) {} + OperationBuilder() : OperationHandle(OperationHandle::create()) {} +}; + +using addf = ValueBuilder; +using affine_apply = ValueBuilder; +using affine_if = OperationBuilder; +using affine_load = ValueBuilder; +using affine_store = OperationBuilder; +using alloc = ValueBuilder; +using call = OperationBuilder; +using constant_float = ValueBuilder; +using constant_index = ValueBuilder; +using constant_int = ValueBuilder; +using dealloc = OperationBuilder; +using dim = ValueBuilder; +using muli = ValueBuilder; +using mulf = ValueBuilder; +using memref_cast = ValueBuilder; +using ret = OperationBuilder; +using select = ValueBuilder; +using std_load = ValueBuilder; +using std_store = OperationBuilder; +using subi = ValueBuilder; +using tanh = ValueBuilder; +using view = ValueBuilder; + +/// Branches into the mlir::Block* captured by BlockHandle `b` with `operands`. +/// +/// Prerequisites: +/// All Handles have already captured previously constructed IR objects. +OperationHandle br(BlockHandle bh, ArrayRef operands); + +/// Creates a new mlir::Block* and branches to it from the current block. +/// Argument types are specified by `operands`. +/// Captures the new block in `bh` and the actual `operands` in `captures`. 
To +/// insert the new mlir::Block*, a local ScopedContext is constructed and +/// released to the current block. The branch operation is then added to the +/// new block. +/// +/// Prerequisites: +/// `b` has not yet captured an mlir::Block*. +/// No `captures` have captured any mlir::Value. +/// All `operands` have already captured an mlir::Value +/// captures.size() == operands.size() +/// captures and operands are pairwise of the same type. +OperationHandle br(BlockHandle *bh, ArrayRef captures, + ArrayRef operands); + +/// Branches into the mlir::Block* captured by BlockHandle `trueBranch` with +/// `trueOperands` if `cond` evaluates to `true` (resp. `falseBranch` and +/// `falseOperands` if `cond` evaluates to `false`). +/// +/// Prerequisites: +/// All Handles have captured previously constructed IR objects. +OperationHandle cond_br(ValueHandle cond, BlockHandle trueBranch, + ArrayRef trueOperands, + BlockHandle falseBranch, + ArrayRef falseOperands); + +/// Eagerly creates new mlir::Block* with argument types specified by +/// `trueOperands`/`falseOperands`. +/// Captures the new blocks in `trueBranch`/`falseBranch` and the arguments in +/// `trueCaptures/falseCaptures`. +/// To insert the new mlir::Block*, a local ScopedContext is constructed and +/// released. The branch operation is then added in the original location and +/// targeting the eagerly constructed blocks. +/// +/// Prerequisites: +/// `trueBranch`/`falseBranch` has not yet captured an mlir::Block*. +/// No `trueCaptures`/`falseCaptures` have captured any mlir::Value. +/// All `trueOperands`/`falseOperands` have already captured an mlir::Value +/// `trueCaptures`.size() == `trueOperands`.size() +/// `falseCaptures`.size() == `falseOperands`.size() +/// `trueCaptures` and `trueOperands` are pairwise of the same type +/// `falseCaptures` and `falseOperands` are pairwise of the same type. 
+OperationHandle cond_br(ValueHandle cond, BlockHandle *trueBranch, + ArrayRef trueCaptures, + ArrayRef trueOperands, + BlockHandle *falseBranch, + ArrayRef falseCaptures, + ArrayRef falseOperands); +} // namespace intrinsics +} // namespace edsc +} // namespace mlir + +#endif // MLIR_EDSC_INTRINSICS_H_ diff --git a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h new file mode 100644 index 0000000000000000000000000000000000000000..4f218bd0d9b40963650939e68a2a13e7f6f04fe9 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h @@ -0,0 +1,126 @@ +//===- ExecutionEngine.h - MLIR Execution engine and utils -----*- C++ -*--===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides a JIT-backed execution engine for MLIR modules. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_EXECUTIONENGINE_H_ +#define MLIR_EXECUTIONENGINE_EXECUTIONENGINE_H_ + +#include "mlir/Support/LLVM.h" +#include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Error.h" + +#include +#include + +namespace llvm { +template class Expected; +class Module; +class ExecutionEngine; +class MemoryBuffer; +} // namespace llvm + +namespace mlir { + +class ModuleOp; + +/// A simple object cache following Lang's LLJITWithObjectCache example. +class SimpleObjectCache : public llvm::ObjectCache { +public: + void notifyObjectCompiled(const llvm::Module *M, + llvm::MemoryBufferRef ObjBuffer) override; + std::unique_ptr getObject(const llvm::Module *M) override; + + /// Dump cached object to output file `filename`. 
+ void dumpToObjectFile(StringRef filename); + +private: + llvm::StringMap> cachedObjects; +}; + +/// JIT-backed execution engine for MLIR modules. Assumes the module can be +/// converted to LLVM IR. For each function, creates a wrapper function with +/// the fixed interface +/// +/// void _mlir_funcName(void **) +/// +/// where the only argument is interpreted as a list of pointers to the actual +/// arguments of the function, followed by a pointer to the result. This allows +/// the engine to provide the caller with a generic function pointer that can +/// be used to invoke the JIT-compiled function. +class ExecutionEngine { +public: + ExecutionEngine(bool enableObjectCache); + + /// Creates an execution engine for the given module. If `transformer` is + /// provided, it will be called on the LLVM module during JIT-compilation and + /// can be used, e.g., for reporting or optimization. `jitCodeGenOptLevel`, + /// when provided, is used as the optimization level for target code + /// generation. If `sharedLibPaths` are provided, the underlying + /// JIT-compilation will open and link the shared libraries for symbol + /// resolution. If `objectCache` is provided, JIT compiler will use it to + /// store the object generated for the given module. + static llvm::Expected> create( + ModuleOp m, std::function transformer = {}, + Optional jitCodeGenOptLevel = llvm::None, + ArrayRef sharedLibPaths = {}, bool enableObjectCache = false); + + /// Looks up a packed-argument function with the given name and returns a + /// pointer to it. Propagates errors in case of failure. + llvm::Expected lookup(StringRef name) const; + + /// Invokes the function with the given name passing it the list of arguments. + /// The arguments are accepted by lvalue-reference since the packed function + /// interface expects a list of non-null pointers. + template + llvm::Error invoke(StringRef name, Args &... 
args); + + /// Invokes the function with the given name passing it the list of arguments + /// as a list of opaque pointers. This is the arity-agnostic equivalent of + /// the templated `invoke`. + llvm::Error invoke(StringRef name, MutableArrayRef args); + + /// Set the target triple on the module. This is implicitly done when creating + /// the engine. + static bool setupTargetTriple(llvm::Module *llvmModule); + + /// Dump object code to output file `filename`. + void dumpToObjectFile(StringRef filename); + +private: + // Ordering of llvmContext and jit is important for destruction purposes: the + // jit must be destroyed before the context. + llvm::LLVMContext llvmContext; + + // Underlying LLJIT. + std::unique_ptr jit; + + // Underlying cache. + std::unique_ptr cache; +}; + +template +llvm::Error ExecutionEngine::invoke(StringRef name, Args &... args) { + auto expectedFPtr = lookup(name); + if (!expectedFPtr) + return expectedFPtr.takeError(); + auto fptr = *expectedFPtr; + + SmallVector packedArgs{static_cast(&args)...}; + (*fptr)(packedArgs.data()); + + return llvm::Error::success(); +} + +} // end namespace mlir + +#endif // MLIR_EXECUTIONENGINE_EXECUTIONENGINE_H_ diff --git a/mlir/include/mlir/ExecutionEngine/OptUtils.h b/mlir/include/mlir/ExecutionEngine/OptUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..7b7b2598db5bdada4a5ba3cffa2b4d6189d2e083 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/OptUtils.h @@ -0,0 +1,57 @@ +//===- OptUtils.h - MLIR Execution Engine opt pass utilities ----*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the utility functions to trigger LLVM optimizations from +// MLIR Execution Engine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_OPTUTILS_H_ +#define MLIR_EXECUTIONENGINE_OPTUTILS_H_ + +#include "llvm/Pass.h" + +#include +#include + +namespace llvm { +class Module; +class Error; +class TargetMachine; +} // namespace llvm + +namespace mlir { + +/// Initialize LLVM passes that can be used when running MLIR code using +/// ExecutionEngine. +void initializeLLVMPasses(); + +/// Create a module transformer function for MLIR ExecutionEngine that runs +/// LLVM IR passes corresponding to the given speed and size optimization +/// levels (e.g. -O2 or -Os). If not null, `targetMachine` is used to +/// initialize passes that provide target-specific information to the LLVM +/// optimizer. `targetMachine` must outlive the returned std::function. +std::function +makeOptimizingTransformer(unsigned optLevel, unsigned sizeLevel, + llvm::TargetMachine *targetMachine); + +/// Create a module transformer function for MLIR ExecutionEngine that runs +/// LLVM IR passes explicitly specified, plus an optional optimization level. +/// Any optimization passes, if present, will be inserted before the pass at +/// position optPassesInsertPos. If not null, `targetMachine` is used to +/// initialize passes that provide target-specific information to the LLVM +/// optimizer. `targetMachine` must outlive the returned std::function. 
+std::function +makeLLVMPassesTransformer(llvm::ArrayRef llvmPasses, + llvm::Optional mbOptLevel, + llvm::TargetMachine *targetMachine, + unsigned optPassesInsertPos = 0); + +} // end namespace mlir + +#endif // MLIR_EXECUTIONENGINE_OPTUTILS_H_ diff --git a/mlir/include/mlir/IR/AffineExpr.h b/mlir/include/mlir/IR/AffineExpr.h new file mode 100644 index 0000000000000000000000000000000000000000..7059489ed4c9eda91ecd423f5a3652baf79be0e5 --- /dev/null +++ b/mlir/include/mlir/IR/AffineExpr.h @@ -0,0 +1,321 @@ +//===- AffineExpr.h - MLIR Affine Expr Class --------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// An affine expression is an affine combination of dimension identifiers and +// symbols, including ceildiv/floordiv/mod by a constant integer. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_AFFINE_EXPR_H +#define MLIR_IR_AFFINE_EXPR_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/Support/Casting.h" +#include + +namespace mlir { + +class MLIRContext; +class AffineMap; +class IntegerSet; + +namespace detail { + +struct AffineExprStorage; +struct AffineBinaryOpExprStorage; +struct AffineDimExprStorage; +struct AffineSymbolExprStorage; +struct AffineConstantExprStorage; + +} // namespace detail + +enum class AffineExprKind { + Add, + /// RHS of mul is always a constant or a symbolic expression. + Mul, + /// RHS of mod is always a constant or a symbolic expression with a positive + /// value. + Mod, + /// RHS of floordiv is always a constant or a symbolic expression. + FloorDiv, + /// RHS of ceildiv is always a constant or a symbolic expression. 
+ CeilDiv, + + /// This is a marker for the last affine binary op. The range of binary + /// op's is expected to be this element and earlier. + LAST_AFFINE_BINARY_OP = CeilDiv, + + /// Constant integer. + Constant, + /// Dimensional identifier. + DimId, + /// Symbolic identifier. + SymbolId, +}; + +/// Base type for affine expression. +/// AffineExpr's are immutable value types with intuitive operators to +/// operate on chainable, lightweight compositions. +/// An AffineExpr is an interface to the underlying storage type pointer. +class AffineExpr { +public: + using ImplType = detail::AffineExprStorage; + + AffineExpr() : expr(nullptr) {} + /* implicit */ AffineExpr(const ImplType *expr) + : expr(const_cast(expr)) {} + + AffineExpr(const AffineExpr &other) : expr(other.expr) {} + AffineExpr &operator=(AffineExpr other) { + expr = other.expr; + return *this; + } + + bool operator==(AffineExpr other) const { return expr == other.expr; } + bool operator!=(AffineExpr other) const { return !(*this == other); } + bool operator==(int64_t v) const; + bool operator!=(int64_t v) const { return !(*this == v); } + explicit operator bool() const { return expr; } + + bool operator!() const { return expr == nullptr; } + + template bool isa() const; + template U dyn_cast() const; + template U cast() const; + + MLIRContext *getContext() const; + + /// Return the classification for this type. + AffineExprKind getKind() const; + + void print(raw_ostream &os) const; + void dump() const; + + /// Returns true if this expression is made out of only symbols and + /// constants, i.e., it does not involve dimensional identifiers. + bool isSymbolicOrConstant() const; + + /// Returns true if this is a pure affine expression, i.e., multiplication, + /// floordiv, ceildiv, and mod is only allowed w.r.t constants. + bool isPureAffine() const; + + /// Returns the greatest known integral divisor of this affine expression. The + /// result is always positive. 
+ int64_t getLargestKnownDivisor() const; + + /// Return true if the affine expression is a multiple of 'factor'. + bool isMultipleOf(int64_t factor) const; + + /// Return true if the affine expression involves AffineDimExpr `position`. + bool isFunctionOfDim(unsigned position) const; + + /// Walk all of the AffineExpr's in this expression in postorder. + void walk(std::function callback) const; + + /// This method substitutes any uses of dimensions and symbols (e.g. + /// dim#0 with dimReplacements[0]) and returns the modified expression tree. + AffineExpr replaceDimsAndSymbols(ArrayRef dimReplacements, + ArrayRef symReplacements) const; + + AffineExpr operator+(int64_t v) const; + AffineExpr operator+(AffineExpr other) const; + AffineExpr operator-() const; + AffineExpr operator-(int64_t v) const; + AffineExpr operator-(AffineExpr other) const; + AffineExpr operator*(int64_t v) const; + AffineExpr operator*(AffineExpr other) const; + AffineExpr floorDiv(uint64_t v) const; + AffineExpr floorDiv(AffineExpr other) const; + AffineExpr ceilDiv(uint64_t v) const; + AffineExpr ceilDiv(AffineExpr other) const; + AffineExpr operator%(uint64_t v) const; + AffineExpr operator%(AffineExpr other) const; + + /// Compose with an AffineMap. + /// Returns the composition of this AffineExpr with `map`. + /// + /// Prerequisites: + /// `this` and `map` are composable, i.e. that the number of AffineDimExpr of + /// `this` is smaller than the number of results of `map`. If a result of a + /// map does not have a corresponding AffineDimExpr, that result simply does + /// not appear in the produced AffineExpr. + /// + /// Example: + /// expr: `d0 + d2` + /// map: `(d0, d1, d2)[s0, s1] -> (d0 + s1, d1 + s0, d0 + d1 + d2)` + /// returned expr: `d0 * 2 + d1 + d2 + s1` + AffineExpr compose(AffineMap map) const; + + friend ::llvm::hash_code hash_value(AffineExpr arg); + +protected: + ImplType *expr; +}; + +/// Affine binary operation expression. 
/// An affine binary operation could be an
/// add, mul, floordiv, ceildiv, or a modulo operation. (Subtraction is
/// represented through a multiply by -1 and add.) These expressions are always
/// constructed in a simplified form. For eg., the LHS and RHS operands can't
/// both be constants. There are additional canonicalizing rules depending on
/// the op type: see checks in the constructor.
class AffineBinaryOpExpr : public AffineExpr {
public:
  using ImplType = detail::AffineBinaryOpExprStorage;
  /* implicit */ AffineBinaryOpExpr(AffineExpr::ImplType *ptr);
  AffineExpr getLHS() const;
  AffineExpr getRHS() const;
};

/// A dimensional identifier appearing in an affine expression.
class AffineDimExpr : public AffineExpr {
public:
  using ImplType = detail::AffineDimExprStorage;
  /* implicit */ AffineDimExpr(AffineExpr::ImplType *ptr);
  unsigned getPosition() const;
};

/// A symbolic identifier appearing in an affine expression.
class AffineSymbolExpr : public AffineExpr {
public:
  // NOTE(review): this reuses AffineDimExprStorage even though an
  // AffineSymbolExprStorage is forward-declared at the top of this header --
  // confirm which storage the uniquer actually allocates for symbol exprs.
  using ImplType = detail::AffineDimExprStorage;
  /* implicit */ AffineSymbolExpr(AffineExpr::ImplType *ptr);
  unsigned getPosition() const;
};

/// An integer constant appearing in affine expression.
class AffineConstantExpr : public AffineExpr {
public:
  using ImplType = detail::AffineConstantExprStorage;
  /* implicit */ AffineConstantExpr(AffineExpr::ImplType *ptr);
  int64_t getValue() const;
};

/// Make AffineExpr hashable.
inline ::llvm::hash_code hash_value(AffineExpr arg) {
  return ::llvm::hash_value(arg.expr);
}

// Mixed int64_t/AffineExpr arithmetic with the constant on the left; all
// forward to the member operators (subtraction via multiply-by-minus-one).
inline AffineExpr operator+(int64_t val, AffineExpr expr) { return expr + val; }
inline AffineExpr operator*(int64_t val, AffineExpr expr) { return expr * val; }
inline AffineExpr operator-(int64_t val, AffineExpr expr) {
  return expr * (-1) + val;
}

/// These free functions allow clients of the API to not use classes in detail.
+AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context); +AffineExpr getAffineSymbolExpr(unsigned position, MLIRContext *context); +AffineExpr getAffineConstantExpr(int64_t constant, MLIRContext *context); +AffineExpr getAffineBinaryOpExpr(AffineExprKind kind, AffineExpr lhs, + AffineExpr rhs); + +/// Constructs an affine expression from a flat ArrayRef. If there are local +/// identifiers (neither dimensional nor symbolic) that appear in the sum of +/// products expression, 'localExprs' is expected to have the AffineExpr +/// for it, and is substituted into. The ArrayRef 'eq' is expected to be in the +/// format [dims, symbols, locals, constant term]. +AffineExpr toAffineExpr(ArrayRef eq, unsigned numDims, + unsigned numSymbols, ArrayRef localExprs, + MLIRContext *context); + +raw_ostream &operator<<(raw_ostream &os, AffineExpr &expr); + +template bool AffineExpr::isa() const { + if (std::is_same::value) { + return getKind() <= AffineExprKind::LAST_AFFINE_BINARY_OP; + } + if (std::is_same::value) { + return getKind() == AffineExprKind::DimId; + } + if (std::is_same::value) { + return getKind() == AffineExprKind::SymbolId; + } + if (std::is_same::value) { + return getKind() == AffineExprKind::Constant; + } +} +template U AffineExpr::dyn_cast() const { + if (isa()) { + return U(expr); + } + return U(nullptr); +} +template U AffineExpr::cast() const { + assert(isa()); + return U(expr); +} + +/// Simplify an affine expression by flattening and some amount of +/// simple analysis. This has complexity linear in the number of nodes in +/// 'expr'. Returns the simplified expression, which is the same as the input +/// expression if it can't be simplified. +AffineExpr simplifyAffineExpr(AffineExpr expr, unsigned numDims, + unsigned numSymbols); + +/// Flattens 'expr' into 'flattenedExpr'. Returns true on success or false +/// if 'expr' could not be flattened (i.e., semi-affine is not yet handled). 
+/// See documentation for AffineExprFlattener on how mod's and div's are +/// flattened. +bool getFlattenedAffineExpr(AffineExpr expr, unsigned numDims, + unsigned numSymbols, + SmallVectorImpl *flattenedExpr); + +/// Flattens the result expressions of the map to their corresponding flattened +/// forms and set in 'flattenedExprs'. Returns true on success or false +/// if any expression in the map could not be flattened (i.e., semi-affine is +/// not yet handled). For all affine expressions that share the same operands +/// (like those of an affine map), this method should be used instead of +/// repeatedly calling getFlattenedAffineExpr since local variables added to +/// deal with div's and mod's will be reused across expressions. +bool getFlattenedAffineExprs( + AffineMap map, std::vector> *flattenedExprs); +bool getFlattenedAffineExprs( + IntegerSet set, std::vector> *flattenedExprs); + +namespace detail { +template void bindDims(MLIRContext *ctx) {} + +template +void bindDims(MLIRContext *ctx, AffineExprTy &e, AffineExprTy2 &... exprs) { + e = getAffineDimExpr(N, ctx); + bindDims(ctx, exprs...); +} +} // namespace detail + +/// Bind a list of AffineExpr references to DimExpr at positions: +/// [0 .. sizeof...(exprs)] +template +void bindDims(MLIRContext *ctx, AffineExprTy &... 
exprs) { + detail::bindDims<0>(ctx, exprs...); +} + +} // namespace mlir + +namespace llvm { + +// AffineExpr hash just like pointers +template <> struct DenseMapInfo { + static mlir::AffineExpr getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::AffineExpr(static_cast(pointer)); + } + static mlir::AffineExpr getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::AffineExpr(static_cast(pointer)); + } + static unsigned getHashValue(mlir::AffineExpr val) { + return mlir::hash_value(val); + } + static bool isEqual(mlir::AffineExpr LHS, mlir::AffineExpr RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif // MLIR_IR_AFFINE_EXPR_H diff --git a/mlir/include/mlir/IR/AffineExprVisitor.h b/mlir/include/mlir/IR/AffineExprVisitor.h new file mode 100644 index 0000000000000000000000000000000000000000..7866d6bb996a86ec380db07e360dd5369a7700b6 --- /dev/null +++ b/mlir/include/mlir/IR/AffineExprVisitor.h @@ -0,0 +1,325 @@ +//===- AffineExprVisitor.h - MLIR AffineExpr Visitor Class ------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the AffineExpr visitor class. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_AFFINE_EXPR_VISITOR_H +#define MLIR_IR_AFFINE_EXPR_VISITOR_H + +#include "mlir/IR/AffineExpr.h" + +namespace mlir { + +/// Base class for AffineExpr visitors/walkers. +/// +/// AffineExpr visitors are used when you want to perform different actions +/// for different kinds of AffineExprs without having to use lots of casts +/// and a big switch instruction. 
///
/// To define your own visitor, inherit from this class, specifying your
/// new type for the 'SubClass' template parameter, and "override" visitXXX
/// functions in your class. This class is defined in terms of statically
/// resolved overloading, not virtual functions.
///
/// For example, here is a visitor that counts the number of AffineDimExprs
/// in an AffineExpr.
///
/// /// Declare the class. Note that we derive from AffineExprVisitor
/// /// instantiated with our new subclass's type.
///
/// struct DimExprCounter : public AffineExprVisitor<DimExprCounter> {
///   unsigned numDimExprs;
///   DimExprCounter() : numDimExprs(0) {}
///   void visitDimExpr(AffineDimExpr expr) { ++numDimExprs; }
/// };
///
/// And this class would be used like this:
///   DimExprCounter dec;
///   dec.visit(affineExpr);
///   numDimExprs = dec.numDimExprs;
///
/// AffineExprVisitor provides visit methods for the following binary affine
/// op expressions:
/// AffineBinaryAddOpExpr, AffineBinaryMulOpExpr,
/// AffineBinaryModOpExpr, AffineBinaryFloorDivOpExpr,
/// AffineBinaryCeilDivOpExpr. Note that default implementations of these
/// methods will call the general AffineBinaryOpExpr method.
///
/// In addition, visit methods are provided for the following affine
/// expressions: AffineConstantExpr, AffineDimExpr, and
/// AffineSymbolExpr.
///
/// Note that if you don't implement visitXXX for some affine expression type,
/// the visitXXX method for Instruction superclass will be invoked.
///
/// Note that this class is specifically designed as a template to avoid
/// virtual function call overhead. Defining and using a AffineExprVisitor is
/// just as efficient as having your own switch instruction over the
/// instruction opcode.
+ +template class AffineExprVisitor { + //===--------------------------------------------------------------------===// + // Interface code - This is the public interface of the AffineExprVisitor + // that you use to visit affine expressions... +public: + // Function to walk an AffineExpr (in post order). + RetTy walkPostOrder(AffineExpr expr) { + static_assert(std::is_base_of::value, + "Must instantiate with a derived type of AffineExprVisitor"); + switch (expr.getKind()) { + case AffineExprKind::Add: { + auto binOpExpr = expr.cast(); + walkOperandsPostOrder(binOpExpr); + return static_cast(this)->visitAddExpr(binOpExpr); + } + case AffineExprKind::Mul: { + auto binOpExpr = expr.cast(); + walkOperandsPostOrder(binOpExpr); + return static_cast(this)->visitMulExpr(binOpExpr); + } + case AffineExprKind::Mod: { + auto binOpExpr = expr.cast(); + walkOperandsPostOrder(binOpExpr); + return static_cast(this)->visitModExpr(binOpExpr); + } + case AffineExprKind::FloorDiv: { + auto binOpExpr = expr.cast(); + walkOperandsPostOrder(binOpExpr); + return static_cast(this)->visitFloorDivExpr(binOpExpr); + } + case AffineExprKind::CeilDiv: { + auto binOpExpr = expr.cast(); + walkOperandsPostOrder(binOpExpr); + return static_cast(this)->visitCeilDivExpr(binOpExpr); + } + case AffineExprKind::Constant: + return static_cast(this)->visitConstantExpr( + expr.cast()); + case AffineExprKind::DimId: + return static_cast(this)->visitDimExpr( + expr.cast()); + case AffineExprKind::SymbolId: + return static_cast(this)->visitSymbolExpr( + expr.cast()); + } + } + + // Function to visit an AffineExpr. 
+ RetTy visit(AffineExpr expr) { + static_assert(std::is_base_of::value, + "Must instantiate with a derived type of AffineExprVisitor"); + switch (expr.getKind()) { + case AffineExprKind::Add: { + auto binOpExpr = expr.cast(); + return static_cast(this)->visitAddExpr(binOpExpr); + } + case AffineExprKind::Mul: { + auto binOpExpr = expr.cast(); + return static_cast(this)->visitMulExpr(binOpExpr); + } + case AffineExprKind::Mod: { + auto binOpExpr = expr.cast(); + return static_cast(this)->visitModExpr(binOpExpr); + } + case AffineExprKind::FloorDiv: { + auto binOpExpr = expr.cast(); + return static_cast(this)->visitFloorDivExpr(binOpExpr); + } + case AffineExprKind::CeilDiv: { + auto binOpExpr = expr.cast(); + return static_cast(this)->visitCeilDivExpr(binOpExpr); + } + case AffineExprKind::Constant: + return static_cast(this)->visitConstantExpr( + expr.cast()); + case AffineExprKind::DimId: + return static_cast(this)->visitDimExpr( + expr.cast()); + case AffineExprKind::SymbolId: + return static_cast(this)->visitSymbolExpr( + expr.cast()); + } + llvm_unreachable("Unknown AffineExpr"); + } + + //===--------------------------------------------------------------------===// + // Visitation functions... these functions provide default fallbacks in case + // the user does not specify what to do for a particular instruction type. + // The default behavior is to generalize the instruction type to its subtype + // and try visiting the subtype. All of this should be inlined perfectly, + // because there are no virtual functions to get in the way. + // + + // Default visit methods. Note that the default op-specific binary op visit + // methods call the general visitAffineBinaryOpExpr visit method. 
+ void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) {} + void visitAddExpr(AffineBinaryOpExpr expr) { + static_cast(this)->visitAffineBinaryOpExpr(expr); + } + void visitMulExpr(AffineBinaryOpExpr expr) { + static_cast(this)->visitAffineBinaryOpExpr(expr); + } + void visitModExpr(AffineBinaryOpExpr expr) { + static_cast(this)->visitAffineBinaryOpExpr(expr); + } + void visitFloorDivExpr(AffineBinaryOpExpr expr) { + static_cast(this)->visitAffineBinaryOpExpr(expr); + } + void visitCeilDivExpr(AffineBinaryOpExpr expr) { + static_cast(this)->visitAffineBinaryOpExpr(expr); + } + void visitConstantExpr(AffineConstantExpr expr) {} + void visitDimExpr(AffineDimExpr expr) {} + void visitSymbolExpr(AffineSymbolExpr expr) {} + +private: + // Walk the operands - each operand is itself walked in post order. + void walkOperandsPostOrder(AffineBinaryOpExpr expr) { + walkPostOrder(expr.getLHS()); + walkPostOrder(expr.getRHS()); + } +}; + +// This class is used to flatten a pure affine expression (AffineExpr, +// which is in a tree form) into a sum of products (w.r.t constants) when +// possible, and in that process simplifying the expression. For a modulo, +// floordiv, or a ceildiv expression, an additional identifier, called a local +// identifier, is introduced to rewrite the expression as a sum of product +// affine expression. Each local identifier is always and by construction a +// floordiv of a pure add/mul affine function of dimensional, symbolic, and +// other local identifiers, in a non-mutually recursive way. Hence, every local +// identifier can ultimately always be recovered as an affine function of +// dimensional and symbolic identifiers (involving floordiv's); note however +// that by AffineExpr construction, some floordiv combinations are converted to +// mod's. The result of the flattening is a flattened expression and a set of +// constraints involving just the local variables. 
//
// d2 + (d0 + d1) floordiv 4 is flattened to d2 + q where 'q' is the local
// variable introduced, with localVarCst containing 4*q <= d0 + d1 <= 4*q + 3.
//
// The simplification performed includes the accumulation of contributions for
// each dimensional and symbolic identifier together, the simplification of
// floordiv/ceildiv/mod expressions and other simplifications that in turn
// happen as a result. A simplification that this flattening naturally performs
// is of simplifying the numerator and denominator of floordiv/ceildiv, and
// folding a modulo expression to a zero, if possible. Three examples are below:
//
// (((d0 + 3 * d1) + d0) - 2 * d1) - d0    simplified to  d0 + d1
// (d0 - d0 mod 4 + 4) mod 4               simplified to  0
// (3*d0 + 2*d1 + d0) floordiv 2 + d1      simplified to  2*d0 + 2*d1
//
// The way the flattening works for the second example is as follows: d0 % 4 is
// replaced by d0 - 4*q with q being introduced: the expression then simplifies
// to: (d0 - (d0 - 4q) + 4) = 4q + 4, modulo of which w.r.t 4 simplifies to
// zero. Note that an affine expression may not always be expressible purely as
// a sum of products involving just the original dimensional and symbolic
// identifiers due to the presence of modulo/floordiv/ceildiv expressions that
// may not be eliminated after simplification; in such cases, the final
// expression can be reconstructed by replacing the local identifiers with their
// corresponding explicit form stored in 'localExprs' (note that each of the
// explicit forms itself would have been simplified).
//
// The expression walk method here performs a linear time post order walk that
// performs the above simplifications through visit methods, with partial
// results being stored in 'operandExprStack'.
When a parent expr is visited, +// the flattened expressions corresponding to its two operands would already be +// on the stack - the parent expression looks at the two flattened expressions +// and combines the two. It pops off the operand expressions and pushes the +// combined result (although this is done in-place on its LHS operand expr). +// When the walk is completed, the flattened form of the top-level expression +// would be left on the stack. +// +// A flattener can be repeatedly used for multiple affine expressions that bind +// to the same operands, for example, for all result expressions of an +// AffineMap or AffineValueMap. In such cases, using it for multiple expressions +// is more efficient than creating a new flattener for each expression since +// common identical div and mod expressions appearing across different +// expressions are mapped to the same local identifier (same column position in +// 'localVarCst'). +class SimpleAffineExprFlattener + : public AffineExprVisitor { +public: + // Flattend expression layout: [dims, symbols, locals, constant] + // Stack that holds the LHS and RHS operands while visiting a binary op expr. + // In future, consider adding a prepass to determine how big the SmallVector's + // will be, and linearize this to std::vector to prevent + // SmallVector moves on re-allocation. + std::vector> operandExprStack; + + unsigned numDims; + unsigned numSymbols; + + // Number of newly introduced identifiers to flatten mod/floordiv/ceildiv's. + unsigned numLocals; + + // AffineExpr's corresponding to the floordiv/ceildiv/mod expressions for + // which new identifiers were introduced; if the latter do not get canceled + // out, these expressions can be readily used to reconstruct the AffineExpr + // (tree) form. Note that these expressions themselves would have been + // simplified (recursively) by this pass. Eg. d0 + (d0 + 2*d1 + d0) ceildiv 4 + // will be simplified to d0 + q, where q = (d0 + d1) ceildiv 2. 
(d0 + d1) + // ceildiv 2 would be the local expression stored for q. + SmallVector localExprs; + + SimpleAffineExprFlattener(unsigned numDims, unsigned numSymbols); + + virtual ~SimpleAffineExprFlattener() = default; + + // Visitor method overrides. + void visitMulExpr(AffineBinaryOpExpr expr); + void visitAddExpr(AffineBinaryOpExpr expr); + void visitDimExpr(AffineDimExpr expr); + void visitSymbolExpr(AffineSymbolExpr expr); + void visitConstantExpr(AffineConstantExpr expr); + void visitCeilDivExpr(AffineBinaryOpExpr expr); + void visitFloorDivExpr(AffineBinaryOpExpr expr); + + // + // t = expr mod c <=> t = expr - c*q and c*q <= expr <= c*q + c - 1 + // + // A mod expression "expr mod c" is thus flattened by introducing a new local + // variable q (= expr floordiv c), such that expr mod c is replaced with + // 'expr - c * q' and c * q <= expr <= c * q + c - 1 are added to localVarCst. + void visitModExpr(AffineBinaryOpExpr expr); + +protected: + // Add a local identifier (needed to flatten a mod, floordiv, ceildiv expr). + // The local identifier added is always a floordiv of a pure add/mul affine + // function of other identifiers, coefficients of which are specified in + // dividend and with respect to a positive constant divisor. localExpr is the + // simplified tree expression (AffineExpr) corresponding to the quantifier. + virtual void addLocalFloorDivId(ArrayRef dividend, int64_t divisor, + AffineExpr localExpr); + +private: + // t = expr floordiv c <=> t = q, c * q <= expr <= c * q + c - 1 + // A floordiv is thus flattened by introducing a new local variable q, and + // replacing that expression with 'q' while adding the constraints + // c * q <= expr <= c * q + c - 1 to localVarCst (done by + // FlatAffineConstraints::addLocalFloorDiv). 
+ // + // A ceildiv is similarly flattened: + // t = expr ceildiv c <=> t = (expr + c - 1) floordiv c + void visitDivExpr(AffineBinaryOpExpr expr, bool isCeil); + + int findLocalId(AffineExpr localExpr); + + inline unsigned getNumCols() const { + return numDims + numSymbols + numLocals + 1; + } + inline unsigned getConstantIndex() const { return getNumCols() - 1; } + inline unsigned getLocalVarStartIndex() const { return numDims + numSymbols; } + inline unsigned getSymbolStartIndex() const { return numDims; } + inline unsigned getDimStartIndex() const { return 0; } +}; + +} // end namespace mlir + +#endif // MLIR_IR_AFFINE_EXPR_VISITOR_H diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h new file mode 100644 index 0000000000000000000000000000000000000000..3f9116cb1687c0663e9fb6dc30a1b7fa81449058 --- /dev/null +++ b/mlir/include/mlir/IR/AffineMap.h @@ -0,0 +1,251 @@ +//===- AffineMap.h - MLIR Affine Map Class ----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Affine maps are mathematical functions which map a list of dimension +// identifiers and symbols, to multidimensional affine expressions. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_AFFINE_MAP_H +#define MLIR_IR_AFFINE_MAP_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" + +namespace mlir { + +namespace detail { +struct AffineMapStorage; +} // end namespace detail + +class AffineExpr; +class Attribute; +struct LogicalResult; +class MLIRContext; + +/// A multi-dimensional affine map +/// Affine map's are immutable like Type's, and they are uniqued. 
+/// Eg: (d0, d1) -> (d0/128, d0 mod 128, d1) +/// The names used (d0, d1) don't matter - it's the mathematical function that +/// is unique to this affine map. +class AffineMap { +public: + using ImplType = detail::AffineMapStorage; + + AffineMap() : map(nullptr) {} + explicit AffineMap(ImplType *map) : map(map) {} + AffineMap(const AffineMap &other) : map(other.map) {} + AffineMap &operator=(const AffineMap &other) = default; + + /// Returns a zero result affine map with no dimensions or symbols: () -> (). + static AffineMap get(MLIRContext *context); + + static AffineMap get(unsigned dimCount, unsigned symbolCount, + ArrayRef results); + + /// Returns a single constant result affine map. + static AffineMap getConstantMap(int64_t val, MLIRContext *context); + + /// Returns an AffineMap with 'numDims' identity result dim exprs. + static AffineMap getMultiDimIdentityMap(unsigned numDims, + MLIRContext *context); + + /// Returns an AffineMap representing a permutation. + /// The permutation is expressed as a non-empty vector of integers. + /// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with + /// `permutation = [1,2,0]`. All values in `permutation` must be + /// integers, in the range 0..`permutation.size()-1` without duplications + /// (i.e. `[1,1,2]` is an invalid permutation). + static AffineMap getPermutationMap(ArrayRef permutation, + MLIRContext *context); + + MLIRContext *getContext() const; + + explicit operator bool() { return map != nullptr; } + bool operator==(AffineMap other) const { return other.map == map; } + bool operator!=(AffineMap other) const { return !(other.map == map); } + + /// Returns true if this affine map is an identity affine map. + /// An identity affine map corresponds to an identity affine function on the + /// dimensional identifiers. + bool isIdentity() const; + + /// Returns true if this affine map is an empty map, i.e., () -> (). 
+ bool isEmpty() const; + + /// Returns true if this affine map is a single result constant function. + bool isSingleConstant() const; + + /// Returns the constant result of this map. This methods asserts that the map + /// has a single constant result. + int64_t getSingleConstantResult() const; + + // Prints affine map to 'os'. + void print(raw_ostream &os) const; + void dump() const; + + unsigned getNumDims() const; + unsigned getNumSymbols() const; + unsigned getNumResults() const; + unsigned getNumInputs() const; + + ArrayRef getResults() const; + AffineExpr getResult(unsigned idx) const; + + /// Walk all of the AffineExpr's in this mapping. Each node in an expression + /// tree is visited in postorder. + void walkExprs(std::function callback) const; + + /// This method substitutes any uses of dimensions and symbols (e.g. + /// dim#0 with dimReplacements[0]) in subexpressions and returns the modified + /// expression mapping. Because this can be used to eliminate dims and + /// symbols, the client needs to specify the number of dims and symbols in + /// the result. The returned map always has the same number of results. + AffineMap replaceDimsAndSymbols(ArrayRef dimReplacements, + ArrayRef symReplacements, + unsigned numResultDims, + unsigned numResultSyms); + + /// Folds the results of the application of an affine map on the provided + /// operands to a constant if possible. + LogicalResult constantFold(ArrayRef operandConstants, + SmallVectorImpl &results) const; + + /// Returns the AffineMap resulting from composing `this` with `map`. + /// The resulting AffineMap has as many AffineDimExpr as `map` and as many + /// AffineSymbolExpr as the concatenation of `this` and `map` (in which case + /// the symbols of `this` map come first). + /// + /// Prerequisites: + /// The maps are composable, i.e. that the number of AffineDimExpr of `this` + /// matches the number of results of `map`. 
+ /// + /// Example: + /// map1: `(d0, d1)[s0, s1] -> (d0 + 1 + s1, d1 - 1 - s0)` + /// map2: `(d0)[s0] -> (d0 + s0, d0 - s0)` + /// map1.compose(map2): + /// `(d0)[s0, s1, s2] -> (d0 + s1 + s2 + 1, d0 - s0 - s2 - 1)` + AffineMap compose(AffineMap map); + + /// Returns true if the AffineMap represents a subset (i.e. a projection) of a + /// symbol-less permutation map. + bool isProjectedPermutation(); + + /// Returns true if the AffineMap represents a symbol-less permutation map. + bool isPermutation(); + + /// Returns the map consisting of the `resultPos` subset. + AffineMap getSubMap(ArrayRef resultPos); + + friend ::llvm::hash_code hash_value(AffineMap arg); + +private: + ImplType *map; + + static AffineMap getImpl(unsigned dimCount, unsigned symbolCount, + ArrayRef results, MLIRContext *context); +}; + +// Make AffineExpr hashable. +inline ::llvm::hash_code hash_value(AffineMap arg) { + return ::llvm::hash_value(arg.map); +} + +/// Simplify an affine map by simplifying its underlying AffineExpr results. +AffineMap simplifyAffineMap(AffineMap map); + +/// Returns a map of codomain to domain dimensions such that the first codomain +/// dimension for a particular domain dimension is selected. +/// Returns an empty map if the input map is empty or if `map` is not invertible +/// (i.e. `map` does not contain a subset that is a permutation of full domain +/// rank). +/// +/// Prerequisites: +/// 1. `map` has no symbols. 
+/// +/// Example 1: +/// +/// ```mlir +/// (d0, d1, d2) -> (d1, d1, d0, d2, d1, d2, d1, d0) +/// 0 2 3 +/// ``` +/// +/// returns: +/// +/// ```mlir +/// (d0, d1, d2, d3, d4, d5, d6, d7) -> (d2, d0, d3) +/// ``` +/// +/// Example 2: +/// +/// ```mlir +/// (d0, d1, d2) -> (d1, d0 + d1, d0, d2, d1, d2, d1, d0) +/// 0 2 3 +/// ``` +/// +/// returns: +/// +/// ```mlir +/// (d0, d1, d2, d3, d4, d5, d6, d7) -> (d2, d0, d3) +/// ``` +AffineMap inversePermutation(AffineMap map); + +/// Concatenates a list of `maps` into a single AffineMap, stepping over +/// potentially empty maps. Assumes each of the underlying map has 0 symbols. +/// The resulting map has a number of dims equal to the max of `maps`' dims and +/// the concatenated results as its results. +/// Returns an empty map if all input `maps` are empty. +/// +/// Example: +/// When applied to the following list of 3 affine maps, +/// +/// ```mlir +/// { +/// (i, j, k) -> (i, k), +/// (i, j, k) -> (k, j), +/// (i, j, k) -> (i, j) +/// } +/// ``` +/// +/// Returns the map: +/// +/// ```mlir +/// (i, j, k) -> (i, k, k, j, i, j) +/// ``` +AffineMap concatAffineMaps(ArrayRef maps); + +inline raw_ostream &operator<<(raw_ostream &os, AffineMap map) { + map.print(os); + return os; +} +} // end namespace mlir + +namespace llvm { + +// AffineExpr hash just like pointers +template <> struct DenseMapInfo { + static mlir::AffineMap getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::AffineMap(static_cast(pointer)); + } + static mlir::AffineMap getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::AffineMap(static_cast(pointer)); + } + static unsigned getHashValue(mlir::AffineMap val) { + return mlir::hash_value(val); + } + static bool isEqual(mlir::AffineMap LHS, mlir::AffineMap RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif // MLIR_IR_AFFINE_MAP_H diff --git a/mlir/include/mlir/IR/AttributeSupport.h 
b/mlir/include/mlir/IR/AttributeSupport.h new file mode 100644 index 0000000000000000000000000000000000000000..9804d6866f85f921d0eb14e3d6a2fb744b49b95b --- /dev/null +++ b/mlir/include/mlir/IR/AttributeSupport.h @@ -0,0 +1,107 @@ +//===- AttributeSupport.h ---------------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines support types for registering dialect extended attributes. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_ATTRIBUTESUPPORT_H +#define MLIR_IR_ATTRIBUTESUPPORT_H + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/StorageUniquerSupport.h" +#include "llvm/ADT/PointerIntPair.h" + +namespace mlir { +class MLIRContext; +class Type; + +//===----------------------------------------------------------------------===// +// AttributeStorage +//===----------------------------------------------------------------------===// + +namespace detail { +class AttributeUniquer; +} // end namespace detail + +/// Base storage class appearing in an attribute. Derived storage classes should +/// only be constructed within the context of the AttributeUniquer. +class AttributeStorage : public StorageUniquer::BaseStorage { + friend detail::AttributeUniquer; + friend StorageUniquer; + +public: + /// Get the type of this attribute. + Type getType() const; + + /// Get the dialect of this attribute. + Dialect &getDialect() const { + assert(dialect && "Malformed attribute storage object."); + return const_cast(*dialect); + } + +protected: + /// Construct a new attribute storage instance with the given type. + /// Note: All attributes require a valid type. 
If no type is provided here, + /// the type of the attribute will automatically default to NoneType + /// upon initialization in the uniquer. + AttributeStorage(Type type); + AttributeStorage(); + + /// Set the type of this attribute. + void setType(Type type); + + // Set the dialect for this storage instance. This is used by the + // AttributeUniquer when initializing a newly constructed storage object. + void initializeDialect(Dialect &newDialect) { dialect = &newDialect; } + +private: + /// The dialect for this attribute. + Dialect *dialect; + + /// The opaque type of the attribute value. + const void *type; +}; + +/// Default storage type for attributes that require no additional +/// initialization or storage. +using DefaultAttributeStorage = AttributeStorage; + +//===----------------------------------------------------------------------===// +// AttributeStorageAllocator +//===----------------------------------------------------------------------===// + +// This is a utility allocator used to allocate memory for instances of derived +// Attributes. +using AttributeStorageAllocator = StorageUniquer::StorageAllocator; + +//===----------------------------------------------------------------------===// +// AttributeUniquer +//===----------------------------------------------------------------------===// +namespace detail { +// A utility class to get, or create, unique instances of attributes within an +// MLIRContext. This class manages all creation and uniquing of attributes. +class AttributeUniquer { +public: + /// Get an uniqued instance of attribute T. + template + static T get(MLIRContext *ctx, unsigned kind, Args &&... args) { + return ctx->getAttributeUniquer().get( + getInitFn(ctx, T::getClassID()), kind, std::forward(args)...); + } + +private: + /// Returns a functor used to initialize new attribute storage instances. 
+ static std::function + getInitFn(MLIRContext *ctx, const ClassID *const attrID); +}; +} // namespace detail + +} // end namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h new file mode 100644 index 0000000000000000000000000000000000000000..b8398580f61c90ea0da96d4c2670f361168b4419 --- /dev/null +++ b/mlir/include/mlir/IR/Attributes.h @@ -0,0 +1,1440 @@ +//===- Attributes.h - MLIR Attribute Classes --------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_ATTRIBUTES_H +#define MLIR_IR_ATTRIBUTES_H + +#include "mlir/IR/AttributeSupport.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/Sequence.h" + +namespace mlir { +class AffineMap; +class Dialect; +class FunctionType; +class Identifier; +class IntegerSet; +class Location; +class MLIRContext; +class ShapedType; +class Type; + +namespace detail { + +struct AffineMapAttributeStorage; +struct ArrayAttributeStorage; +struct BoolAttributeStorage; +struct DictionaryAttributeStorage; +struct IntegerAttributeStorage; +struct IntegerSetAttributeStorage; +struct FloatAttributeStorage; +struct OpaqueAttributeStorage; +struct StringAttributeStorage; +struct SymbolRefAttributeStorage; +struct TypeAttributeStorage; + +/// Elements Attributes. +struct DenseElementsAttributeStorage; +struct OpaqueElementsAttributeStorage; +struct SparseElementsAttributeStorage; +} // namespace detail + +/// Attributes are known-constant values of operations and functions. +/// +/// Instances of the Attribute class are references to immutable, uniqued, +/// and immortal values owned by MLIRContext. As such, an Attribute is a thin +/// wrapper around an underlying storage pointer. 
Attributes are usually passed +/// by value. +class Attribute { +public: + /// Integer identifier for all the concrete attribute kinds. + enum Kind { + // Reserve attribute kinds for dialect specific extensions. +#define DEFINE_SYM_KIND_RANGE(Dialect) \ + FIRST_##Dialect##_ATTR, LAST_##Dialect##_ATTR = FIRST_##Dialect##_ATTR + 0xff, +#include "DialectSymbolRegistry.def" + }; + + /// Utility class for implementing attributes. + template + using AttrBase = detail::StorageUserBase; + + using ImplType = AttributeStorage; + using ValueType = void; + + Attribute() : impl(nullptr) {} + /* implicit */ Attribute(const ImplType *impl) + : impl(const_cast(impl)) {} + + Attribute(const Attribute &other) = default; + Attribute &operator=(const Attribute &other) = default; + + bool operator==(Attribute other) const { return impl == other.impl; } + bool operator!=(Attribute other) const { return !(*this == other); } + explicit operator bool() const { return impl; } + + bool operator!() const { return impl == nullptr; } + + template bool isa() const; + template U dyn_cast() const; + template U dyn_cast_or_null() const; + template U cast() const; + + // Support dyn_cast'ing Attribute to itself. + static bool classof(Attribute) { return true; } + + /// Return the classification for this attribute. + unsigned getKind() const { return impl->getKind(); } + + /// Return the type of this attribute. + Type getType() const; + + /// Return the context this attribute belongs to. + MLIRContext *getContext() const; + + /// Get the dialect this attribute is registered to. + Dialect &getDialect() const; + + /// Print the attribute. + void print(raw_ostream &os) const; + void dump() const; + + /// Get an opaque pointer to the attribute. + const void *getAsOpaquePointer() const { return impl; } + /// Construct an attribute from the opaque pointer representation. 
+ static Attribute getFromOpaquePointer(const void *ptr) { + return Attribute(reinterpret_cast(ptr)); + } + + friend ::llvm::hash_code hash_value(Attribute arg); + +protected: + ImplType *impl; +}; + +inline raw_ostream &operator<<(raw_ostream &os, Attribute attr) { + attr.print(os); + return os; +} + +namespace StandardAttributes { +enum Kind { + AffineMap = Attribute::FIRST_STANDARD_ATTR, + Array, + Bool, + Dictionary, + Float, + Integer, + IntegerSet, + Opaque, + String, + SymbolRef, + Type, + Unit, + + /// Elements Attributes. + DenseElements, + OpaqueElements, + SparseElements, + FIRST_ELEMENTS_ATTR = DenseElements, + LAST_ELEMENTS_ATTR = SparseElements, + + /// Locations. + CallSiteLocation, + FileLineColLocation, + FusedLocation, + NameLocation, + OpaqueLocation, + UnknownLocation, + + // Represents a location as a 'void*' pointer to a front-end's opaque + // location information, which must live longer than the MLIR objects that + // refer to it. OpaqueLocation's are never serialized. + // + // TODO: OpaqueLocation, + + // Represents a value inlined through a function call. + // TODO: InlinedLocation, + + FIRST_LOCATION_ATTR = CallSiteLocation, + LAST_LOCATION_ATTR = UnknownLocation, +}; +} // namespace StandardAttributes + +//===----------------------------------------------------------------------===// +// AffineMapAttr +//===----------------------------------------------------------------------===// + +class AffineMapAttr + : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = AffineMap; + + static AffineMapAttr get(AffineMap value); + + AffineMap getValue() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. 
+ static bool kindof(unsigned kind) { + return kind == StandardAttributes::AffineMap; + } +}; + +//===----------------------------------------------------------------------===// +// ArrayAttr +//===----------------------------------------------------------------------===// + +/// Array attributes are lists of other attributes. They are not necessarily +/// type homogenous given that attributes don't, in general, carry types. +class ArrayAttr : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = ArrayRef; + + static ArrayAttr get(ArrayRef value, MLIRContext *context); + + ArrayRef getValue() const; + + /// Support range iteration. + using iterator = llvm::ArrayRef::iterator; + iterator begin() const { return getValue().begin(); } + iterator end() const { return getValue().end(); } + size_t size() const { return getValue().size(); } + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::Array; + } +}; + +//===----------------------------------------------------------------------===// +// BoolAttr +//===----------------------------------------------------------------------===// + +class BoolAttr : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = bool; + + static BoolAttr get(bool value, MLIRContext *context); + + bool getValue() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { return kind == StandardAttributes::Bool; } +}; + +//===----------------------------------------------------------------------===// +// DictionaryAttr +//===----------------------------------------------------------------------===// + +/// NamedAttribute is used for dictionary attributes, it holds an identifier for +/// the name and a value for the attribute. The attribute pointer should always +/// be non-null. 
+using NamedAttribute = std::pair; + +/// Dictionary attribute is an attribute that represents a sorted collection of +/// named attribute values. The elements are sorted by name, and each name must +/// be unique within the collection. +class DictionaryAttr + : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = ArrayRef; + + static DictionaryAttr get(ArrayRef value, + MLIRContext *context); + + ArrayRef getValue() const; + + /// Return the specified attribute if present, null otherwise. + Attribute get(StringRef name) const; + Attribute get(Identifier name) const; + + /// Support range iteration. + using iterator = llvm::ArrayRef::iterator; + iterator begin() const; + iterator end() const; + bool empty() const { return size() == 0; } + size_t size() const; + + /// Methods for supporting type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::Dictionary; + } +}; + +//===----------------------------------------------------------------------===// +// FloatAttr +//===----------------------------------------------------------------------===// + +class FloatAttr : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = APFloat; + + /// Return a float attribute for the specified value in the specified type. + /// These methods should only be used for simple constant values, e.g 1.0/2.0, + /// that are known-valid both as host double and the 'type' format. + static FloatAttr get(Type type, double value); + static FloatAttr getChecked(Type type, double value, Location loc); + + /// Return a float attribute for the specified value in the specified type. + static FloatAttr get(Type type, const APFloat &value); + static FloatAttr getChecked(Type type, const APFloat &value, Location loc); + + APFloat getValue() const; + + /// This function is used to convert the value to a double, even if it loses + /// precision. 
+ double getValueAsDouble() const; + static double getValueAsDouble(APFloat val); + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::Float; + } + + /// Verify the construction invariants for a double value. + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, Type type, + double value); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, Type type, + const APFloat &value); +}; + +//===----------------------------------------------------------------------===// +// IntegerAttr +//===----------------------------------------------------------------------===// + +class IntegerAttr + : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = APInt; + + static IntegerAttr get(Type type, int64_t value); + static IntegerAttr get(Type type, const APInt &value); + + APInt getValue() const; + // TODO(jpienaar): Change callers to use getValue instead. + int64_t getInt() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::Integer; + } +}; + +//===----------------------------------------------------------------------===// +// IntegerSetAttr +//===----------------------------------------------------------------------===// + +class IntegerSetAttr + : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = IntegerSet; + + static IntegerSetAttr get(IntegerSet value); + + IntegerSet getValue() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. 
+ static bool kindof(unsigned kind) { + return kind == StandardAttributes::IntegerSet; + } +}; + +//===----------------------------------------------------------------------===// +// OpaqueAttr +//===----------------------------------------------------------------------===// + +/// Opaque attributes represent attributes of non-registered dialects. These are +/// attribute represented in their raw string form, and can only usefully be +/// tested for attribute equality. +class OpaqueAttr : public Attribute::AttrBase { +public: + using Base::Base; + + /// Get or create a new OpaqueAttr with the provided dialect and string data. + static OpaqueAttr get(Identifier dialect, StringRef attrData, Type type, + MLIRContext *context); + + /// Get or create a new OpaqueAttr with the provided dialect and string data. + /// If the given identifier is not a valid namespace for a dialect, then a + /// null attribute is returned. + static OpaqueAttr getChecked(Identifier dialect, StringRef attrData, + Type type, Location location); + + /// Returns the dialect namespace of the opaque attribute. + Identifier getDialectNamespace() const; + + /// Returns the raw attribute data of the opaque attribute. + StringRef getAttrData() const; + + /// Verify the construction of an opaque attribute. + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Identifier dialect, + StringRef attrData, + Type type); + + static bool kindof(unsigned kind) { + return kind == StandardAttributes::Opaque; + } +}; + +//===----------------------------------------------------------------------===// +// StringAttr +//===----------------------------------------------------------------------===// + +class StringAttr : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = StringRef; + + /// Get an instance of a StringAttr with the given string. 
+ static StringAttr get(StringRef bytes, MLIRContext *context); + + /// Get an instance of a StringAttr with the given string and Type. + static StringAttr get(StringRef bytes, Type type); + + StringRef getValue() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::String; + } +}; + +//===----------------------------------------------------------------------===// +// SymbolRefAttr +//===----------------------------------------------------------------------===// + +class FlatSymbolRefAttr; + +/// A symbol reference attribute represents a symbolic reference to another +/// operation. +class SymbolRefAttr + : public Attribute::AttrBase { +public: + using Base::Base; + + /// Construct a symbol reference for the given value name. + static FlatSymbolRefAttr get(StringRef value, MLIRContext *ctx); + + /// Construct a symbol reference for the given value name, and a set of nested + /// references that are further resolve to a nested symbol. + static SymbolRefAttr get(StringRef value, + ArrayRef references, + MLIRContext *ctx); + + /// Returns the name of the top level symbol reference, i.e. the root of the + /// reference path. + StringRef getRootReference() const; + + /// Returns the name of the fully resolved symbol, i.e. the leaf of the + /// reference path. + StringRef getLeafReference() const; + + /// Returns the set of nested references representing the path to the symbol + /// nested under the root reference. + ArrayRef getNestedReferences() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::SymbolRef; + } +}; + +/// A symbol reference with a reference path containing a single element. This +/// is used to refer to an operation within the current symbol table. 
+class FlatSymbolRefAttr : public SymbolRefAttr { +public: + using SymbolRefAttr::SymbolRefAttr; + using ValueType = StringRef; + + /// Construct a symbol reference for the given value name. + static FlatSymbolRefAttr get(StringRef value, MLIRContext *ctx) { + return SymbolRefAttr::get(value, ctx); + } + + /// Returns the name of the held symbol reference. + StringRef getValue() const { return getRootReference(); } + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool classof(Attribute attr) { + SymbolRefAttr refAttr = attr.dyn_cast(); + return refAttr && refAttr.getNestedReferences().empty(); + } + +private: + using SymbolRefAttr::get; + using SymbolRefAttr::getNestedReferences; +}; + +//===----------------------------------------------------------------------===// +// Type +//===----------------------------------------------------------------------===// + +class TypeAttr : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = Type; + + static TypeAttr get(Type value); + + Type getValue() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { return kind == StandardAttributes::Type; } +}; + +//===----------------------------------------------------------------------===// +// UnitAttr +//===----------------------------------------------------------------------===// + +/// Unit attributes are attributes that hold no specific value and are given +/// meaning by their existence. 
+class UnitAttr : public Attribute::AttrBase { +public: + using Base::Base; + + static UnitAttr get(MLIRContext *context); + + static bool kindof(unsigned kind) { return kind == StandardAttributes::Unit; } +}; + +//===----------------------------------------------------------------------===// +// Elements Attributes +//===----------------------------------------------------------------------===// + +namespace detail { +template class ElementsAttrIterator; +template class ElementsAttrRange; +} // namespace detail + +/// A base attribute that represents a reference to a static shaped tensor or +/// vector constant. +class ElementsAttr : public Attribute { +public: + using Attribute::Attribute; + template using iterator = detail::ElementsAttrIterator; + template using iterator_range = detail::ElementsAttrRange; + + /// Return the type of this ElementsAttr, guaranteed to be a vector or tensor + /// with static shape. + ShapedType getType() const; + + /// Return the value at the given index. The index is expected to refer to a + /// valid element. + Attribute getValue(ArrayRef index) const; + + /// Return the value of type 'T' at the given index, where 'T' corresponds to + /// an Attribute type. + template T getValue(ArrayRef index) const { + return getValue(index).template cast(); + } + + /// Return the elements of this attribute as a value of type 'T'. Note: + /// Aborts if the subclass is OpaqueElementsAttrs, these attrs do not support + /// iteration. + template iterator_range getValues() const; + + /// Return if the given 'index' refers to a valid element in this attribute. + bool isValidIndex(ArrayRef index) const; + + /// Returns the number of elements held by this attribute. + int64_t getNumElements() const; + + /// Generates a new ElementsAttr by mapping each int value to a new + /// underlying APInt. The new values can represent either a integer or float. + /// This ElementsAttr should contain integers. 
+ ElementsAttr mapValues(Type newElementType, + function_ref mapping) const; + + /// Generates a new ElementsAttr by mapping each float value to a new + /// underlying APInt. The new values can represent either a integer or float. + /// This ElementsAttr should contain floats. + ElementsAttr mapValues(Type newElementType, + function_ref mapping) const; + + /// Method for support type inquiry through isa, cast and dyn_cast. + static bool classof(Attribute attr) { + return attr.getKind() >= StandardAttributes::FIRST_ELEMENTS_ATTR && + attr.getKind() <= StandardAttributes::LAST_ELEMENTS_ATTR; + } + +protected: + /// Returns the 1 dimensional flattened row-major index from the given + /// multi-dimensional index. + uint64_t getFlattenedIndex(ArrayRef index) const; +}; + +namespace detail { +/// DenseElementsAttr data is aligned to uint64_t, so this traits class is +/// necessary to interop with PointerIntPair. +class DenseElementDataPointerTypeTraits { +public: + static inline const void *getAsVoidPointer(const char *ptr) { return ptr; } + static inline const char *getFromVoidPointer(const void *ptr) { + return static_cast(ptr); + } + + // Note: We could steal more bits if the need arises. + enum { NumLowBitsAvailable = 1 }; +}; + +/// Pair of raw pointer and a boolean flag of whether the pointer holds a splat, +using DenseIterPtrAndSplat = + llvm::PointerIntPair; + +/// Impl iterator for indexed DenseElementAttr iterators that records a data +/// pointer and data index that is adjusted for the case of a splat attribute. +template +class DenseElementIndexedIteratorImpl + : public indexed_accessor_iterator { +protected: + DenseElementIndexedIteratorImpl(const char *data, bool isSplat, + size_t dataIndex) + : indexed_accessor_iterator({data, isSplat}, dataIndex) {} + + /// Return the current index for this iterator, adjusted for the case of a + /// splat. + ptrdiff_t getDataIndex() const { + bool isSplat = this->base.getInt(); + return isSplat ? 
0 : this->index; + } + + /// Return the data base pointer. + const char *getData() const { return this->base.getPointer(); } +}; +} // namespace detail + +/// An attribute that represents a reference to a dense vector or tensor object. +/// +class DenseElementsAttr + : public Attribute::AttrBase { +public: + using Base::Base; + + /// Method for support type inquiry through isa, cast and dyn_cast. + static bool classof(Attribute attr) { + return attr.getKind() == StandardAttributes::DenseElements; + } + + /// Constructs a dense elements attribute from an array of element values. + /// Each element attribute value is expected to be an element of 'type'. + /// 'type' must be a vector or tensor with static shape. + static DenseElementsAttr get(ShapedType type, ArrayRef values); + + /// Constructs a dense integer elements attribute from an array of integer + /// or floating-point values. Each value is expected to be the same bitwidth + /// of the element type of 'type'. 'type' must be a vector or tensor with + /// static shape. + template ::is_integer || + llvm::is_one_of::value>::type> + static DenseElementsAttr get(const ShapedType &type, ArrayRef values) { + const char *data = reinterpret_cast(values.data()); + return getRawIntOrFloat( + type, ArrayRef(data, values.size() * sizeof(T)), sizeof(T), + /*isInt=*/std::numeric_limits::is_integer); + } + + /// Constructs a dense integer elements attribute from a single element. + template ::is_integer || + llvm::is_one_of::value>::type> + static DenseElementsAttr get(const ShapedType &type, T value) { + return get(type, llvm::makeArrayRef(value)); + } + + /// Overload of the above 'get' method that is specialized for boolean values. + static DenseElementsAttr get(ShapedType type, ArrayRef values); + + /// Constructs a dense integer elements attribute from an array of APInt + /// values. Each APInt value is expected to have the same bitwidth as the + /// element type of 'type'. 
'type' must be a vector or tensor with static + /// shape. + static DenseElementsAttr get(ShapedType type, ArrayRef values); + + /// Constructs a dense float elements attribute from an array of APFloat + /// values. Each APFloat value is expected to have the same bitwidth as the + /// element type of 'type'. 'type' must be a vector or tensor with static + /// shape. + static DenseElementsAttr get(ShapedType type, ArrayRef values); + + /// Construct a dense elements attribute for an initializer_list of values. + /// Each value is expected to be the same bitwidth of the element type of + /// 'type'. 'type' must be a vector or tensor with static shape. + template + static DenseElementsAttr get(const ShapedType &type, + const std::initializer_list &list) { + return get(type, ArrayRef(list)); + } + + //===--------------------------------------------------------------------===// + // Iterators + //===--------------------------------------------------------------------===// + + /// A utility iterator that allows walking over the internal Attribute values + /// of a DenseElementsAttr. + class AttributeElementIterator + : public indexed_accessor_iterator { + public: + /// Accesses the Attribute value at this iterator position. + Attribute operator*() const; + + private: + friend DenseElementsAttr; + + /// Constructs a new iterator. + AttributeElementIterator(DenseElementsAttr attr, size_t index); + }; + + /// Iterator for walking raw element values of the specified type 'T', which + /// may be any c++ data type matching the stored representation: int32_t, + /// float, etc. + template + class ElementIterator + : public detail::DenseElementIndexedIteratorImpl, + const T> { + public: + /// Accesses the raw value at this iterator position. + const T &operator*() const { + return reinterpret_cast(this->getData())[this->getDataIndex()]; + } + + private: + friend DenseElementsAttr; + + /// Constructs a new iterator. 
+ ElementIterator(const char *data, bool isSplat, size_t dataIndex) + : detail::DenseElementIndexedIteratorImpl, const T>( + data, isSplat, dataIndex) {} + }; + + /// A utility iterator that allows walking over the internal bool values. + class BoolElementIterator + : public detail::DenseElementIndexedIteratorImpl { + public: + /// Accesses the bool value at this iterator position. + bool operator*() const; + + private: + friend DenseElementsAttr; + + /// Constructs a new iterator. + BoolElementIterator(DenseElementsAttr attr, size_t dataIndex); + }; + + /// A utility iterator that allows walking over the internal raw APInt values. + class IntElementIterator + : public detail::DenseElementIndexedIteratorImpl { + public: + /// Accesses the raw APInt value at this iterator position. + APInt operator*() const; + + private: + friend DenseElementsAttr; + + /// Constructs a new iterator. + IntElementIterator(DenseElementsAttr attr, size_t dataIndex); + + /// The bitwidth of the element type. + size_t bitWidth; + }; + + /// Iterator for walking over APFloat values. + class FloatElementIterator final + : public llvm::mapped_iterator> { + friend DenseElementsAttr; + + /// Initializes the float element iterator to the specified iterator. + FloatElementIterator(const llvm::fltSemantics &smt, IntElementIterator it); + + public: + using reference = APFloat; + }; + + //===--------------------------------------------------------------------===// + // Value Querying + //===--------------------------------------------------------------------===// + + /// Returns if this attribute corresponds to a splat, i.e. if all element + /// values are the same. + bool isSplat() const; + + /// Return the splat value for this attribute. This asserts that the attribute + /// corresponds to a splat. 
+ Attribute getSplatValue() const { return getSplatValue(); } + template + typename std::enable_if::value || + std::is_same::value, + T>::type + getSplatValue() const { + assert(isSplat() && "expected the attribute to be a splat"); + return *getValues().begin(); + } + /// Return the splat value for derived attribute element types. + template + typename std::enable_if::value && + !std::is_same::value, + T>::type + getSplatValue() const { + return getSplatValue().template cast(); + } + + /// Return the value at the given index. The 'index' is expected to refer to a + /// valid element. + Attribute getValue(ArrayRef index) const { + return getValue(index); + } + template T getValue(ArrayRef index) const { + // Skip to the element corresponding to the flattened index. + return *std::next(getValues().begin(), getFlattenedIndex(index)); + } + + /// Return the held element values as a range of integer or floating-point + /// values. + template ::value && + std::numeric_limits::is_integer) || + llvm::is_one_of::value>::type> + llvm::iterator_range> getValues() const { + assert(isValidIntOrFloat(sizeof(T), std::numeric_limits::is_integer)); + auto rawData = getRawData().data(); + bool splat = isSplat(); + return {ElementIterator(rawData, splat, 0), + ElementIterator(rawData, splat, getNumElements())}; + } + + /// Return the held element values as a range of Attributes. + llvm::iterator_range getAttributeValues() const; + template ::value>::type> + llvm::iterator_range getValues() const { + return getAttributeValues(); + } + AttributeElementIterator attr_value_begin() const; + AttributeElementIterator attr_value_end() const; + + /// Return the held element values a range of T, where T is a derived + /// attribute type. 
+ template + using DerivedAttributeElementIterator = + llvm::mapped_iterator; + template ::value && + !std::is_same::value>::type> + llvm::iterator_range> getValues() const { + auto castFn = [](Attribute attr) { return attr.template cast(); }; + return llvm::map_range(getAttributeValues(), + static_cast(castFn)); + } + + /// Return the held element values as a range of bool. The element type of + /// this attribute must be of integer type of bitwidth 1. + llvm::iterator_range getBoolValues() const; + template ::value>::type> + llvm::iterator_range getValues() const { + return getBoolValues(); + } + + /// Return the held element values as a range of APInts. The element type of + /// this attribute must be of integer type. + llvm::iterator_range getIntValues() const; + template ::value>::type> + llvm::iterator_range getValues() const { + return getIntValues(); + } + IntElementIterator int_value_begin() const; + IntElementIterator int_value_end() const; + + /// Return the held element values as a range of APFloat. The element type of + /// this attribute must be of float type. + llvm::iterator_range getFloatValues() const; + template ::value>::type> + llvm::iterator_range getValues() const { + return getFloatValues(); + } + FloatElementIterator float_value_begin() const; + FloatElementIterator float_value_end() const; + + //===--------------------------------------------------------------------===// + // Mutation Utilities + //===--------------------------------------------------------------------===// + + /// Return a new DenseElementsAttr that has the same data as the current + /// attribute, but has been reshaped to 'newType'. The new type must have the + /// same total number of elements as well as element type. + DenseElementsAttr reshape(ShapedType newType); + + /// Generates a new DenseElementsAttr by mapping each int value to a new + /// underlying APInt. The new values can represent either a integer or float. 
+ /// This underlying type must be an DenseIntElementsAttr. + DenseElementsAttr mapValues(Type newElementType, + function_ref mapping) const; + + /// Generates a new DenseElementsAttr by mapping each float value to a new + /// underlying APInt. the new values can represent either a integer or float. + /// This underlying type must be an DenseFPElementsAttr. + DenseElementsAttr + mapValues(Type newElementType, + function_ref mapping) const; + +protected: + /// Return the raw storage data held by this attribute. + ArrayRef getRawData() const; + + /// Get iterators to the raw APInt values for each element in this attribute. + IntElementIterator raw_int_begin() const { + return IntElementIterator(*this, 0); + } + IntElementIterator raw_int_end() const { + return IntElementIterator(*this, getNumElements()); + } + + /// Constructs a dense elements attribute from an array of raw APInt values. + /// Each APInt value is expected to have the same bitwidth as the element type + /// of 'type'. 'type' must be a vector or tensor with static shape. + static DenseElementsAttr getRaw(ShapedType type, ArrayRef values); + + /// Get or create a new dense elements attribute instance with the given raw + /// data buffer. 'type' must be a vector or tensor with static shape. + static DenseElementsAttr getRaw(ShapedType type, ArrayRef data, + bool isSplat); + + /// Overload of the raw 'get' method that asserts that the given type is of + /// integer or floating-point type. This method is used to verify type + /// invariants that the templatized 'get' method cannot. + static DenseElementsAttr getRawIntOrFloat(ShapedType type, + ArrayRef data, + int64_t dataEltSize, bool isInt); + + /// Check the information for a c++ data type, check if this type is valid for + /// the current attribute. This method is used to verify specific type + /// invariants that the templatized 'getValues' method cannot. 
+ bool isValidIntOrFloat(int64_t dataEltSize, bool isInt) const; +}; + +/// An attribute that represents a reference to a dense float vector or tensor +/// object. Each element is stored as a double. +class DenseFPElementsAttr : public DenseElementsAttr { +public: + using iterator = DenseElementsAttr::FloatElementIterator; + + using DenseElementsAttr::DenseElementsAttr; + + /// Get an instance of a DenseFPElementsAttr with the given arguments. This + /// simply wraps the DenseElementsAttr::get calls. + template + static DenseFPElementsAttr get(const ShapedType &type, Arg &&arg) { + return DenseElementsAttr::get(type, llvm::makeArrayRef(arg)) + .template cast(); + } + template + static DenseFPElementsAttr get(const ShapedType &type, + const std::initializer_list &list) { + return DenseElementsAttr::get(type, list) + .template cast(); + } + + /// Generates a new DenseElementsAttr by mapping each value attribute, and + /// constructing the DenseElementsAttr given the new element type. + DenseElementsAttr + mapValues(Type newElementType, + function_ref mapping) const; + + /// Iterator access to the float element values. + iterator begin() const { return float_value_begin(); } + iterator end() const { return float_value_end(); } + + /// Method for supporting type inquiry through isa, cast and dyn_cast. + static bool classof(Attribute attr); +}; + +/// An attribute that represents a reference to a dense integer vector or tensor +/// object. +class DenseIntElementsAttr : public DenseElementsAttr { +public: + /// DenseIntElementsAttr iterates on APInt, so we can use the raw element + /// iterator directly. + using iterator = DenseElementsAttr::IntElementIterator; + + using DenseElementsAttr::DenseElementsAttr; + + /// Get an instance of a DenseIntElementsAttr with the given arguments. This + /// simply wraps the DenseElementsAttr::get calls. 
+ template + static DenseIntElementsAttr get(const ShapedType &type, Arg &&arg) { + return DenseElementsAttr::get(type, llvm::makeArrayRef(arg)) + .template cast(); + } + template + static DenseIntElementsAttr get(const ShapedType &type, + const std::initializer_list &list) { + return DenseElementsAttr::get(type, list) + .template cast(); + } + + /// Generates a new DenseElementsAttr by mapping each value attribute, and + /// constructing the DenseElementsAttr given the new element type. + DenseElementsAttr mapValues(Type newElementType, + function_ref mapping) const; + + /// Iterator access to the integer element values. + iterator begin() const { return raw_int_begin(); } + iterator end() const { return raw_int_end(); } + + /// Method for supporting type inquiry through isa, cast and dyn_cast. + static bool classof(Attribute attr); +}; + +/// An opaque attribute that represents a reference to a vector or tensor +/// constant with opaque content. This representation is for tensor constants +/// which the compiler may not need to interpret. This attribute is always +/// associated with a particular dialect, which provides a method to convert +/// tensor representation to a non-opaque format. +class OpaqueElementsAttr + : public Attribute::AttrBase { +public: + using Base::Base; + using ValueType = StringRef; + + static OpaqueElementsAttr get(Dialect *dialect, ShapedType type, + StringRef bytes); + + StringRef getValue() const; + + /// Return the value at the given index. The 'index' is expected to refer to a + /// valid element. + Attribute getValue(ArrayRef index) const; + + /// Decodes the attribute value using dialect-specific decoding hook. + /// Returns false if decoding is successful. If not, returns true and leaves + /// 'result' argument unspecified. + bool decode(ElementsAttr &result); + + /// Returns dialect associated with this opaque constant. + Dialect *getDialect() const; + + /// Method for support type inquiry through isa, cast and dyn_cast. 
+ static bool kindof(unsigned kind) { + return kind == StandardAttributes::OpaqueElements; + } +}; + +/// An attribute that represents a reference to a sparse vector or tensor +/// object. +/// +/// This class uses COO (coordinate list) encoding to represent the sparse +/// elements in an element attribute. Specifically, the sparse vector/tensor +/// stores the indices and values as two separate dense elements attributes of +/// tensor type (even if the sparse attribute is of vector type, in order to +/// support empty lists). The dense elements attribute indices is a 2-D tensor +/// of 64-bit integer elements with shape [N, ndims], which specifies the +/// indices of the elements in the sparse tensor that contains nonzero values. +/// The dense elements attribute values is a 1-D tensor with shape [N], and it +/// supplies the corresponding values for the indices. +/// +/// For example, +/// `sparse, [[0, 0], [1, 2]], [1, 5]>` represents tensor +/// [[1, 0, 0, 0], +/// [0, 0, 5, 0], +/// [0, 0, 0, 0]]. +class SparseElementsAttr + : public Attribute::AttrBase { +public: + using Base::Base; + + template + using iterator = + llvm::mapped_iterator, + std::function>; + + /// 'type' must be a vector or tensor with static shape. + static SparseElementsAttr get(ShapedType type, DenseElementsAttr indices, + DenseElementsAttr values); + + DenseIntElementsAttr getIndices() const; + + DenseElementsAttr getValues() const; + + /// Return the values of this attribute in the form of the given type 'T'. 'T' + /// may be any of Attribute, APInt, APFloat, c++ integer/float types, etc. + template llvm::iterator_range> getValues() const { + auto zeroValue = getZeroValue(); + auto valueIt = getValues().getValues().begin(); + const std::vector flatSparseIndices(getFlattenedSparseIndices()); + // TODO(riverriddle): Move-capture flatSparseIndices when c++14 is + // available. + std::function mapFn = [=](ptrdiff_t index) { + // Try to map the current index to one of the sparse indices. 
+ for (unsigned i = 0, e = flatSparseIndices.size(); i != e; ++i) + if (flatSparseIndices[i] == index) + return *std::next(valueIt, i); + // Otherwise, return the zero value. + return zeroValue; + }; + return llvm::map_range(llvm::seq(0, getNumElements()), mapFn); + } + + /// Return the value of the element at the given index. The 'index' is + /// expected to refer to a valid element. + Attribute getValue(ArrayRef index) const; + + /// Method for support type inquiry through isa, cast and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::SparseElements; + } + +private: + /// Get a zero APFloat for the given sparse attribute. + APFloat getZeroAPFloat() const; + + /// Get a zero APInt for the given sparse attribute. + APInt getZeroAPInt() const; + + /// Get a zero attribute for the given sparse attribute. + Attribute getZeroAttr() const; + + /// Utility methods to generate a zero value of some type 'T'. This is used by + /// the 'iterator' class. + /// Get a zero for a given attribute type. + template + typename std::enable_if::value, T>::type + getZeroValue() const { + return getZeroAttr().template cast(); + } + /// Get a zero for an APInt. + template + typename std::enable_if::value, T>::type + getZeroValue() const { + return getZeroAPInt(); + } + /// Get a zero for an APFloat. + template + typename std::enable_if::value, T>::type + getZeroValue() const { + return getZeroAPFloat(); + } + /// Get a zero for an C++ integer or float type. + template + typename std::enable_if::is_integer || + llvm::is_one_of::value, + T>::type + getZeroValue() const { + return T(0); + } + + /// Flatten, and return, all of the sparse indices in this attribute in + /// row-major order. + std::vector getFlattenedSparseIndices() const; +}; + +/// An attribute that represents a reference to a splat vector or tensor +/// constant, meaning all of the elements have the same value. 
+class SplatElementsAttr : public DenseElementsAttr { +public: + using DenseElementsAttr::DenseElementsAttr; + + /// Method for support type inquiry through isa, cast and dyn_cast. + static bool classof(Attribute attr) { + auto denseAttr = attr.dyn_cast(); + return denseAttr && denseAttr.isSplat(); + } +}; + +namespace detail { +/// This class represents a general iterator over the values of an ElementsAttr. +/// It supports all subclasses aside from OpaqueElementsAttr. +template +class ElementsAttrIterator + : public llvm::iterator_facade_base, + std::random_access_iterator_tag, T, + std::ptrdiff_t, T, T> { + // NOTE: We use a dummy enable_if here because MSVC cannot use 'decltype' + // inside of a conversion operator. + using DenseIteratorT = typename std::enable_if< + true, + decltype(std::declval().getValues().begin())>::type; + using SparseIteratorT = SparseElementsAttr::iterator; + + /// A union containing the specific iterators for each derived attribute kind. + union Iterator { + Iterator(DenseIteratorT &&it) : denseIt(std::move(it)) {} + Iterator(SparseIteratorT &&it) : sparseIt(std::move(it)) {} + Iterator() {} + ~Iterator() {} + + operator const DenseIteratorT &() const { return denseIt; } + operator const SparseIteratorT &() const { return sparseIt; } + operator DenseIteratorT &() { return denseIt; } + operator SparseIteratorT &() { return sparseIt; } + + /// An instance of a dense elements iterator. + DenseIteratorT denseIt; + /// An instance of a sparse elements iterator. + SparseIteratorT sparseIt; + }; + + /// Utility method to process a functor on each of the internal iterator + /// types. + template class ProcessFn, + typename... Args> + RetT process(Args &... 
args) const { + switch (attrKind) { + case StandardAttributes::DenseElements: + return ProcessFn()(args...); + case StandardAttributes::SparseElements: + return ProcessFn()(args...); + } + llvm_unreachable("unexpected attribute kind"); + } + + /// Utility functors used to generically implement the iterators methods. + template struct PlusAssign { + void operator()(ItT &it, ptrdiff_t offset) { it += offset; } + }; + template struct Minus { + ptrdiff_t operator()(const ItT &lhs, const ItT &rhs) { return lhs - rhs; } + }; + template struct MinusAssign { + void operator()(ItT &it, ptrdiff_t offset) { it -= offset; } + }; + template struct Dereference { + T operator()(ItT &it) { return *it; } + }; + template struct ConstructIter { + void operator()(ItT &dest, const ItT &it) { ::new (&dest) ItT(it); } + }; + template struct DestructIter { + void operator()(ItT &it) { it.~ItT(); } + }; + +public: + ElementsAttrIterator(const ElementsAttrIterator &rhs) + : attrKind(rhs.attrKind) { + process(it, rhs.it); + } + ~ElementsAttrIterator() { process(it); } + + /// Methods necessary to support random access iteration. + ptrdiff_t operator-(const ElementsAttrIterator &rhs) const { + assert(attrKind == rhs.attrKind && "incompatible iterators"); + return process(it, rhs.it); + } + bool operator==(const ElementsAttrIterator &rhs) const { + return rhs.attrKind == attrKind && process(it, rhs.it); + } + bool operator<(const ElementsAttrIterator &rhs) const { + assert(attrKind == rhs.attrKind && "incompatible iterators"); + return process(it, rhs.it); + } + ElementsAttrIterator &operator+=(ptrdiff_t offset) { + process(it, offset); + return *this; + } + ElementsAttrIterator &operator-=(ptrdiff_t offset) { + process(it, offset); + return *this; + } + + /// Dereference the iterator at the current index. 
+ T operator*() { return process(it); } + +private: + template + ElementsAttrIterator(unsigned attrKind, IteratorT &&it) + : attrKind(attrKind), it(std::forward(it)) {} + + /// Allow accessing the constructor. + friend ElementsAttr; + + /// The kind of derived elements attribute. + unsigned attrKind; + + /// A union containing the specific iterators for each derived kind. + Iterator it; +}; + +template +class ElementsAttrRange : public llvm::iterator_range> { + using llvm::iterator_range>::iterator_range; +}; +} // namespace detail + +/// Return the elements of this attribute as a value of type 'T'. +template +auto ElementsAttr::getValues() const -> iterator_range { + if (DenseElementsAttr denseAttr = dyn_cast()) { + auto values = denseAttr.getValues(); + return {iterator(getKind(), values.begin()), + iterator(getKind(), values.end())}; + } + if (SparseElementsAttr sparseAttr = dyn_cast()) { + auto values = sparseAttr.getValues(); + return {iterator(getKind(), values.begin()), + iterator(getKind(), values.end())}; + } + llvm_unreachable("unexpected attribute kind"); +} + +//===----------------------------------------------------------------------===// +// Attributes Utils +//===----------------------------------------------------------------------===// + +template bool Attribute::isa() const { + assert(impl && "isa<> used on a null attribute."); + return U::classof(*this); +} +template U Attribute::dyn_cast() const { + return isa() ? U(impl) : U(nullptr); +} +template U Attribute::dyn_cast_or_null() const { + return (impl && isa()) ? U(impl) : U(nullptr); +} +template U Attribute::cast() const { + assert(isa()); + return U(impl); +} + +// Make Attribute hashable. 
+inline ::llvm::hash_code hash_value(Attribute arg) { + return ::llvm::hash_value(arg.impl); +} + +//===----------------------------------------------------------------------===// +// NamedAttributeList +//===----------------------------------------------------------------------===// + +/// A NamedAttributeList is used to manage a list of named attributes. This +/// provides simple interfaces for adding/removing/finding attributes from +/// within a DictionaryAttr. +/// +/// We assume there will be relatively few attributes on a given operation +/// (maybe a dozen or so, but not hundreds or thousands) so we use linear +/// searches for everything. +class NamedAttributeList { +public: + NamedAttributeList(DictionaryAttr attrs = nullptr) + : attrs((attrs && !attrs.empty()) ? attrs : nullptr) {} + NamedAttributeList(ArrayRef attributes); + + bool operator!=(const NamedAttributeList &other) const { + return !(*this == other); + } + bool operator==(const NamedAttributeList &other) const { + return attrs == other.attrs; + } + + /// Return the underlying dictionary attribute. This may be null, if this list + /// has no attributes. + DictionaryAttr getDictionary() const { return attrs; } + + /// Return all of the attributes on this operation. + ArrayRef getAttrs() const; + + /// Replace the held attributes with ones provided in 'newAttrs'. + void setAttrs(ArrayRef attributes); + + /// Return the specified attribute if present, null otherwise. + Attribute get(StringRef name) const; + Attribute get(Identifier name) const; + + /// If the an attribute exists with the specified name, change it to the new + /// value. Otherwise, add a new attribute with the specified name/value. + void set(Identifier name, Attribute value); + + enum class RemoveResult { Removed, NotFound }; + + /// Remove the attribute with the specified name if it exists. The return + /// value indicates whether the attribute was present or not. 
+ RemoveResult remove(Identifier name); + +private: + DictionaryAttr attrs; +}; + +} // end namespace mlir. + +namespace llvm { + +// Attribute hash just like pointers. +template <> struct DenseMapInfo { + static mlir::Attribute getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::Attribute(static_cast(pointer)); + } + static mlir::Attribute getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::Attribute(static_cast(pointer)); + } + static unsigned getHashValue(mlir::Attribute val) { + return mlir::hash_value(val); + } + static bool isEqual(mlir::Attribute LHS, mlir::Attribute RHS) { + return LHS == RHS; + } +}; + +/// Allow LLVM to steal the low bits of Attributes. +template <> struct PointerLikeTypeTraits { + static inline void *getAsVoidPointer(mlir::Attribute attr) { + return const_cast(attr.getAsOpaquePointer()); + } + static inline mlir::Attribute getFromVoidPointer(void *ptr) { + return mlir::Attribute::getFromOpaquePointer(ptr); + } + enum { NumLowBitsAvailable = 3 }; +}; + +template <> +struct PointerLikeTypeTraits + : public PointerLikeTypeTraits { + static inline mlir::SymbolRefAttr getFromVoidPointer(void *ptr) { + return PointerLikeTypeTraits::getFromVoidPointer(ptr) + .cast(); + } +}; + +} // namespace llvm + +#endif diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h new file mode 100644 index 0000000000000000000000000000000000000000..934eed93c3b3327bfebf015a6975480aab67fb10 --- /dev/null +++ b/mlir/include/mlir/IR/Block.h @@ -0,0 +1,335 @@ +//===- Block.h - MLIR Block Class -------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Block class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_BLOCK_H +#define MLIR_IR_BLOCK_H + +#include "mlir/IR/BlockSupport.h" +#include "mlir/IR/Visitors.h" + +namespace mlir { +/// `Block` represents an ordered list of `Operation`s. +class Block : public IRObjectWithUseList, + public llvm::ilist_node_with_parent { +public: + explicit Block() {} + ~Block(); + + void clear() { + // Drop all references from within this block. + dropAllReferences(); + + // Clear operations in the reverse order so that uses are destroyed + // before their defs. + while (!empty()) + operations.pop_back(); + } + + /// Provide a 'getParent' method for ilist_node_with_parent methods. + /// We mark it as a const function because ilist_node_with_parent specifically + /// requires a 'getParent() const' method. Once ilist_node removes this + /// constraint, we should drop the const to fit the rest of the MLIR const + /// model. + Region *getParent() const; + + /// Returns the closest surrounding operation that contains this block. + Operation *getParentOp(); + + /// Return if this block is the entry block in the parent region. + bool isEntryBlock(); + + /// Insert this block (which must not already be in a region) right before + /// the specified block. + void insertBefore(Block *block); + + /// Unlink this block from its current region and insert it right before the + /// specific block. + void moveBefore(Block *block); + + /// Unlink this Block from its parent region and delete it. + void erase(); + + //===--------------------------------------------------------------------===// + // Block argument management + //===--------------------------------------------------------------------===// + + // This is the list of arguments to the block. 
+ using BlockArgListType = MutableArrayRef; + + BlockArgListType getArguments() { return arguments; } + + using args_iterator = BlockArgListType::iterator; + using reverse_args_iterator = BlockArgListType::reverse_iterator; + args_iterator args_begin() { return getArguments().begin(); } + args_iterator args_end() { return getArguments().end(); } + reverse_args_iterator args_rbegin() { return getArguments().rbegin(); } + reverse_args_iterator args_rend() { return getArguments().rend(); } + + bool args_empty() { return arguments.empty(); } + + /// Add one value to the argument list. + BlockArgument addArgument(Type type); + + /// Add one argument to the argument list for each type specified in the list. + iterator_range addArguments(ArrayRef types); + + /// Erase the argument at 'index' and remove it from the argument list. If + /// 'updatePredTerms' is set to true, this argument is also removed from the + /// terminators of each predecessor to this block. + void eraseArgument(unsigned index, bool updatePredTerms = true); + + unsigned getNumArguments() { return arguments.size(); } + BlockArgument getArgument(unsigned i) { return arguments[i]; } + + //===--------------------------------------------------------------------===// + // Operation list management + //===--------------------------------------------------------------------===// + + /// This is the list of operations in the block. + using OpListType = llvm::iplist; + OpListType &getOperations() { return operations; } + + // Iteration over the operations in the block. 
+ using iterator = OpListType::iterator; + using reverse_iterator = OpListType::reverse_iterator; + + iterator begin() { return operations.begin(); } + iterator end() { return operations.end(); } + reverse_iterator rbegin() { return operations.rbegin(); } + reverse_iterator rend() { return operations.rend(); } + + bool empty() { return operations.empty(); } + void push_back(Operation *op) { operations.push_back(op); } + void push_front(Operation *op) { operations.push_front(op); } + + Operation &back() { return operations.back(); } + Operation &front() { return operations.front(); } + + /// Returns 'op' if 'op' lies in this block, or otherwise finds the + /// ancestor operation of 'op' that lies in this block. Returns nullptr if + /// the latter fails. + /// TODO: This is very specific functionality that should live somewhere else, + /// probably in Dominance.cpp. + Operation *findAncestorOpInBlock(Operation &op); + + /// This drops all operand uses from operations within this block, which is + /// an essential step in breaking cyclic dependences between references when + /// they are to be deleted. + void dropAllReferences(); + + /// This drops all uses of values defined in this block or in the blocks of + /// nested regions wherever the uses are located. + void dropAllDefinedValueUses(); + + /// Returns true if the ordering of the child operations is valid, false + /// otherwise. + bool isOpOrderValid(); + + /// Invalidates the current ordering of operations. + void invalidateOpOrder(); + + /// Verifies the current ordering of child operations matches the + /// validOpOrder flag. Returns false if the order is valid, true otherwise. + bool verifyOpOrder(); + + /// Recomputes the ordering of child operations within the block. + void recomputeOpOrder(); + +private: + /// A utility iterator that filters out operations that are not 'OpT'. 
+ template + class op_filter_iterator + : public llvm::filter_iterator { + static bool filter(Operation &op) { return llvm::isa(op); } + + public: + op_filter_iterator(Block::iterator it, Block::iterator end) + : llvm::filter_iterator( + it, end, &filter) {} + + /// Allow implicit conversion to the underlying block iterator. + operator Block::iterator() const { return this->wrapped(); } + }; + +public: + /// This class provides iteration over the held operations of a block for a + /// specific operation type. + template + class op_iterator : public llvm::mapped_iterator, + OpT (*)(Operation &)> { + static OpT unwrap(Operation &op) { return cast(op); } + + public: + using reference = OpT; + + /// Initializes the iterator to the specified filter iterator. + op_iterator(op_filter_iterator it) + : llvm::mapped_iterator, OpT (*)(Operation &)>( + it, &unwrap) {} + + /// Allow implicit conversion to the underlying block iterator. + operator Block::iterator() const { return this->wrapped(); } + }; + + /// Return an iterator range over the operations within this block that are of + /// 'OpT'. + template iterator_range> getOps() { + auto endIt = end(); + return {op_filter_iterator(begin(), endIt), + op_filter_iterator(endIt, endIt)}; + } + template op_iterator op_begin() { + return op_filter_iterator(begin(), end()); + } + template op_iterator op_end() { + return op_filter_iterator(end(), end()); + } + + /// Return an iterator range over the operation within this block excluding + /// the terminator operation at the end. + iterator_range without_terminator() { + if (begin() == end()) + return {begin(), end()}; + auto endIt = --end(); + return {begin(), endIt}; + } + + //===--------------------------------------------------------------------===// + // Terminator management + //===--------------------------------------------------------------------===// + + /// Get the terminator operation of this block. 
This function asserts that + /// the block has a valid terminator operation. + Operation *getTerminator(); + + //===--------------------------------------------------------------------===// + // Predecessors and successors. + //===--------------------------------------------------------------------===// + + // Predecessor iteration. + using pred_iterator = PredecessorIterator; + pred_iterator pred_begin() { + return pred_iterator((BlockOperand *)getFirstUse()); + } + pred_iterator pred_end() { return pred_iterator(nullptr); } + iterator_range getPredecessors() { + return {pred_begin(), pred_end()}; + } + + /// Return true if this block has no predecessors. + bool hasNoPredecessors(); + + /// If this block has exactly one predecessor, return it. Otherwise, return + /// null. + /// + /// Note that if a block has duplicate predecessors from a single block (e.g. + /// if you have a conditional branch with the same block as the true/false + /// destinations) is not considered to be a single predecessor. + Block *getSinglePredecessor(); + + // Indexed successor access. + unsigned getNumSuccessors(); + Block *getSuccessor(unsigned i); + + // Successor iteration. + using succ_iterator = SuccessorRange::iterator; + succ_iterator succ_begin() { return getSuccessors().begin(); } + succ_iterator succ_end() { return getSuccessors().end(); } + SuccessorRange getSuccessors() { return SuccessorRange(this); } + + //===--------------------------------------------------------------------===// + // Operation Walkers + //===--------------------------------------------------------------------===// + + /// Walk the operations in this block in postorder, calling the callback for + /// each operation. + /// See Operation::walk for more details. + template > + RetT walk(FnT &&callback) { + return walk(begin(), end(), std::forward(callback)); + } + + /// Walk the operations in the specified [begin, end) range of this block in + /// postorder, calling the callback for each operation. 
This method is invoked + /// for void return callbacks. + /// See Operation::walk for more details. + template > + typename std::enable_if::value, RetT>::type + walk(Block::iterator begin, Block::iterator end, FnT &&callback) { + for (auto &op : llvm::make_early_inc_range(llvm::make_range(begin, end))) + detail::walkOperations(&op, callback); + } + + /// Walk the operations in the specified [begin, end) range of this block in + /// postorder, calling the callback for each operation. This method is invoked + /// for interruptible callbacks. + /// See Operation::walk for more details. + template > + typename std::enable_if::value, RetT>::type + walk(Block::iterator begin, Block::iterator end, FnT &&callback) { + for (auto &op : llvm::make_early_inc_range(llvm::make_range(begin, end))) + if (detail::walkOperations(&op, callback).wasInterrupted()) + return WalkResult::interrupt(); + return WalkResult::advance(); + } + + //===--------------------------------------------------------------------===// + // Other + //===--------------------------------------------------------------------===// + + /// Split the block into two blocks before the specified operation or + /// iterator. + /// + /// Note that all operations BEFORE the specified iterator stay as part of + /// the original basic block, and the rest of the operations in the original + /// block are moved to the new block, including the old terminator. The + /// original block is left without a terminator. + /// + /// The newly formed Block is returned, and the specified iterator is + /// invalidated. + Block *splitBlock(iterator splitBefore); + Block *splitBlock(Operation *splitBeforeOp) { + return splitBlock(iterator(splitBeforeOp)); + } + + /// Returns pointer to member of operation list. + static OpListType Block::*getSublistAccess(Operation *) { + return &Block::operations; + } + + void print(raw_ostream &os); + void dump(); + + /// Print out the name of the block without printing its body. 
+ /// NOTE: The printType argument is ignored. We keep it for compatibility + /// with LLVM dominator machinery that expects it to exist. + void printAsOperand(raw_ostream &os, bool printType = true); + +private: + /// Pair of the parent object that owns this block and a bit that signifies if + /// the operations within this block have a valid ordering. + llvm::PointerIntPair parentValidOpOrderPair; + + /// This is the list of operations in the block. + OpListType operations; + + /// This is the list of arguments to the block. + std::vector arguments; + + Block(Block &) = delete; + void operator=(Block &) = delete; + + friend struct llvm::ilist_traits; +}; +} // end namespace mlir + +#endif // MLIR_IR_BLOCK_H diff --git a/mlir/include/mlir/IR/BlockAndValueMapping.h b/mlir/include/mlir/IR/BlockAndValueMapping.h new file mode 100644 index 0000000000000000000000000000000000000000..b7ad36072bd1d6aa488e20f008ff6a600f5f8e0f --- /dev/null +++ b/mlir/include/mlir/IR/BlockAndValueMapping.h @@ -0,0 +1,88 @@ +//===- BlockAndValueMapping.h -----------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a utility class for maintaining a mapping for multiple +// value types. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_BLOCKANDVALUEMAPPING_H +#define MLIR_IR_BLOCKANDVALUEMAPPING_H + +#include "mlir/IR/Block.h" + +namespace mlir { +// This is a utility class for mapping one set of values to another. New +// mappings can be inserted via 'map'. Existing mappings can be +// found via the 'lookup*' functions. There are two variants that differ only in +// return value when an existing is not found for the provided key. 
+// 'lookupOrNull' returns nullptr where as 'lookupOrDefault' will return the +// lookup key. +class BlockAndValueMapping { +public: + /// Inserts a new mapping for 'from' to 'to'. If there is an existing mapping, + /// it is overwritten. + void map(Block *from, Block *to) { valueMap[from] = to; } + void map(Value from, Value to) { + valueMap[from.getAsOpaquePointer()] = to.getAsOpaquePointer(); + } + + /// Erases a mapping for 'from'. + void erase(Block *from) { valueMap.erase(from); } + void erase(Value from) { valueMap.erase(from.getAsOpaquePointer()); } + + /// Checks to see if a mapping for 'from' exists. + bool contains(Block *from) const { return valueMap.count(from); } + bool contains(Value from) const { + return valueMap.count(from.getAsOpaquePointer()); + } + + /// Lookup a mapped value within the map. If a mapping for the provided value + /// does not exist then return nullptr. + Block *lookupOrNull(Block *from) const { + return lookupOrValue(from, (Block *)nullptr); + } + Value lookupOrNull(Value from) const { return lookupOrValue(from, Value()); } + + /// Lookup a mapped value within the map. If a mapping for the provided value + /// does not exist then return the provided value. + Block *lookupOrDefault(Block *from) const { + return lookupOrValue(from, from); + } + Value lookupOrDefault(Value from) const { return lookupOrValue(from, from); } + + /// Lookup a mapped value within the map. This asserts the provided value + /// exists within the map. + template T lookup(T from) const { + auto result = lookupOrNull(from); + assert(result && "expected 'from' to be contained within the map"); + return result; + } + + /// Clears all mappings held by the mapper. + void clear() { valueMap.clear(); } + +private: + /// Utility lookupOrValue that looks up an existing key or returns the + /// provided value. + Block *lookupOrValue(Block *from, Block *value) const { + auto it = valueMap.find(from); + return it != valueMap.end() ? 
reinterpret_cast(it->second) : value; + } + Value lookupOrValue(Value from, Value value) const { + auto it = valueMap.find(from.getAsOpaquePointer()); + return it != valueMap.end() ? Value::getFromOpaquePointer(it->second) + : value; + } + + DenseMap valueMap; +}; + +} // end namespace mlir + +#endif // MLIR_IR_BLOCKANDVALUEMAPPING_H diff --git a/mlir/include/mlir/IR/BlockSupport.h b/mlir/include/mlir/IR/BlockSupport.h new file mode 100644 index 0000000000000000000000000000000000000000..bc6a8245c45c3b2a3f6d7a53f178bcb59f8e17c7 --- /dev/null +++ b/mlir/include/mlir/IR/BlockSupport.h @@ -0,0 +1,144 @@ +//===- BlockSupport.h -------------------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a number of support types for the Block class. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_BLOCK_SUPPORT_H +#define MLIR_IR_BLOCK_SUPPORT_H + +#include "mlir/IR/Value.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" + +namespace mlir { +class Block; + +using BlockOperand = IROperandImpl; + +//===----------------------------------------------------------------------===// +// Predecessors +//===----------------------------------------------------------------------===// + +/// Implement a predecessor iterator for blocks. This works by walking the use +/// lists of the blocks. The entries on this list are the BlockOperands that +/// are embedded into terminator operations. From the operand, we can get the +/// terminator that contains it, and its parent block is the predecessor. 
+class PredecessorIterator final + : public llvm::mapped_iterator, + Block *(*)(BlockOperand &)> { + static Block *unwrap(BlockOperand &value); + +public: + using reference = Block *; + + /// Initializes the operand type iterator to the specified operand iterator. + PredecessorIterator(ValueUseIterator it) + : llvm::mapped_iterator, + Block *(*)(BlockOperand &)>(it, &unwrap) {} + explicit PredecessorIterator(BlockOperand *operand) + : PredecessorIterator(ValueUseIterator(operand)) {} + + /// Get the successor number in the predecessor terminator. + unsigned getSuccessorIndex() const; +}; + +//===----------------------------------------------------------------------===// +// Successors +//===----------------------------------------------------------------------===// + +/// This class implements the successor iterators for Block. +class SuccessorRange final + : public detail::indexed_accessor_range_base { +public: + using RangeBaseT::RangeBaseT; + SuccessorRange(Block *block); + SuccessorRange(Operation *term); + +private: + /// See `detail::indexed_accessor_range_base` for details. + static BlockOperand *offset_base(BlockOperand *object, ptrdiff_t index) { + return object + index; + } + /// See `detail::indexed_accessor_range_base` for details. + static Block *dereference_iterator(BlockOperand *object, ptrdiff_t index) { + return object[index].get(); + } + + /// Allow access to `offset_base` and `dereference_iterator`. + friend RangeBaseT; +}; + +} // end namespace mlir + +namespace llvm { + +//===----------------------------------------------------------------------===// +// ilist_traits for Operation +//===----------------------------------------------------------------------===// + +namespace ilist_detail { +// Explicitly define the node access for the operation list so that we can +// break the dependence on the Operation class in this header. This allows for +// operations to have trailing Regions without a circular include +// dependence. 
+template <> +struct SpecificNodeAccess< + typename compute_node_options<::mlir::Operation>::type> : NodeAccess { +protected: + using OptionsT = typename compute_node_options::type; + using pointer = typename OptionsT::pointer; + using const_pointer = typename OptionsT::const_pointer; + using node_type = ilist_node_impl; + + static node_type *getNodePtr(pointer N); + static const node_type *getNodePtr(const_pointer N); + + static pointer getValuePtr(node_type *N); + static const_pointer getValuePtr(const node_type *N); +}; +} // end namespace ilist_detail + +template <> struct ilist_traits<::mlir::Operation> { + using Operation = ::mlir::Operation; + using op_iterator = simple_ilist::iterator; + + static void deleteNode(Operation *op); + void addNodeToList(Operation *op); + void removeNodeFromList(Operation *op); + void transferNodesFromList(ilist_traits &otherList, + op_iterator first, op_iterator last); + +private: + mlir::Block *getContainingBlock(); +}; + +//===----------------------------------------------------------------------===// +// ilist_traits for Block +//===----------------------------------------------------------------------===// + +template <> +struct ilist_traits<::mlir::Block> : public ilist_alloc_traits<::mlir::Block> { + using Block = ::mlir::Block; + using block_iterator = simple_ilist<::mlir::Block>::iterator; + + void addNodeToList(Block *block); + void removeNodeFromList(Block *block); + void transferNodesFromList(ilist_traits &otherList, + block_iterator first, block_iterator last); + +private: + mlir::Region *getParentRegion(); +}; + +} // end namespace llvm + +#endif // MLIR_IR_BLOCK_SUPPORT_H diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h new file mode 100644 index 0000000000000000000000000000000000000000..2db44cbfa2e71f91a56ce92727d0472bb0e6dbbd --- /dev/null +++ b/mlir/include/mlir/IR/Builders.h @@ -0,0 +1,381 @@ +//===- Builders.h - Helpers for constructing MLIR Classes -------*- C++ -*-===// +// +// 
Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_BUILDERS_H +#define MLIR_IR_BUILDERS_H + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { + +class AffineExpr; +class BlockAndValueMapping; +class ModuleOp; +class UnknownLoc; +class FileLineColLoc; +class Type; +class PrimitiveType; +class IntegerType; +class FunctionType; +class MemRefType; +class VectorType; +class RankedTensorType; +class UnrankedTensorType; +class TupleType; +class NoneType; +class BoolAttr; +class IntegerAttr; +class FloatAttr; +class StringAttr; +class TypeAttr; +class ArrayAttr; +class SymbolRefAttr; +class ElementsAttr; +class DenseElementsAttr; +class DenseIntElementsAttr; +class AffineMapAttr; +class AffineMap; +class UnitAttr; + +/// This class is a general helper class for creating context-global objects +/// like types, attributes, and affine expressions. +class Builder { +public: + explicit Builder(MLIRContext *context) : context(context) {} + explicit Builder(ModuleOp module); + + MLIRContext *getContext() const { return context; } + + Identifier getIdentifier(StringRef str); + + // Locations. + Location getUnknownLoc(); + Location getFileLineColLoc(Identifier filename, unsigned line, + unsigned column); + Location getFusedLoc(ArrayRef locs, + Attribute metadata = Attribute()); + + // Types. + FloatType getBF16Type(); + FloatType getF16Type(); + FloatType getF32Type(); + FloatType getF64Type(); + + IndexType getIndexType(); + + IntegerType getI1Type(); + IntegerType getIntegerType(unsigned width); + FunctionType getFunctionType(ArrayRef inputs, ArrayRef results); + TupleType getTupleType(ArrayRef elementTypes); + NoneType getNoneType(); + + /// Get or construct an instance of the type 'ty' with provided arguments. 
+ template Ty getType(Args... args) { + return Ty::get(context, args...); + } + + // Attributes. + NamedAttribute getNamedAttr(StringRef name, Attribute val); + + UnitAttr getUnitAttr(); + BoolAttr getBoolAttr(bool value); + DictionaryAttr getDictionaryAttr(ArrayRef value); + IntegerAttr getIntegerAttr(Type type, int64_t value); + IntegerAttr getIntegerAttr(Type type, const APInt &value); + FloatAttr getFloatAttr(Type type, double value); + FloatAttr getFloatAttr(Type type, const APFloat &value); + StringAttr getStringAttr(StringRef bytes); + ArrayAttr getArrayAttr(ArrayRef value); + FlatSymbolRefAttr getSymbolRefAttr(Operation *value); + FlatSymbolRefAttr getSymbolRefAttr(StringRef value); + SymbolRefAttr getSymbolRefAttr(StringRef value, + ArrayRef nestedReferences); + + // Returns a 0-valued attribute of the given `type`. This function only + // supports boolean, integer, and 16-/32-/64-bit float types, and vector or + // ranked tensor of them. Returns null attribute otherwise. + Attribute getZeroAttr(Type type); + + // Convenience methods for fixed types. + FloatAttr getF16FloatAttr(float value); + FloatAttr getF32FloatAttr(float value); + FloatAttr getF64FloatAttr(double value); + + IntegerAttr getI8IntegerAttr(int8_t value); + IntegerAttr getI16IntegerAttr(int16_t value); + IntegerAttr getI32IntegerAttr(int32_t value); + IntegerAttr getI64IntegerAttr(int64_t value); + + DenseIntElementsAttr getI32VectorAttr(ArrayRef values); + + ArrayAttr getAffineMapArrayAttr(ArrayRef values); + ArrayAttr getI32ArrayAttr(ArrayRef values); + ArrayAttr getI64ArrayAttr(ArrayRef values); + ArrayAttr getIndexArrayAttr(ArrayRef values); + ArrayAttr getF32ArrayAttr(ArrayRef values); + ArrayAttr getF64ArrayAttr(ArrayRef values); + ArrayAttr getStrArrayAttr(ArrayRef values); + + // Affine expressions and affine maps. 
+ AffineExpr getAffineDimExpr(unsigned position); + AffineExpr getAffineSymbolExpr(unsigned position); + AffineExpr getAffineConstantExpr(int64_t constant); + + // Special cases of affine maps and integer sets + /// Returns a zero result affine map with no dimensions or symbols: () -> (). + AffineMap getEmptyAffineMap(); + /// Returns a single constant result affine map with 0 dimensions and 0 + /// symbols. One constant result: () -> (val). + AffineMap getConstantAffineMap(int64_t val); + // One dimension id identity map: (i) -> (i). + AffineMap getDimIdentityMap(); + // Multi-dimensional identity map: (d0, d1, d2) -> (d0, d1, d2). + AffineMap getMultiDimIdentityMap(unsigned rank); + // One symbol identity map: ()[s] -> (s). + AffineMap getSymbolIdentityMap(); + + /// Returns a map that shifts its (single) input dimension by 'shift'. + /// (d0) -> (d0 + shift) + AffineMap getSingleDimShiftAffineMap(int64_t shift); + + /// Returns an affine map that is a translation (shift) of all result + /// expressions in 'map' by 'shift'. + /// Eg: input: (d0, d1)[s0] -> (d0, d1 + s0), shift = 2 + /// returns: (d0, d1)[s0] -> (d0 + 2, d1 + s0 + 2) + AffineMap getShiftedAffineMap(AffineMap map, int64_t shift); + +protected: + MLIRContext *context; +}; + +/// This class helps build Operations. Operations that are created are +/// automatically inserted at an insertion point. The builder is copyable. +class OpBuilder : public Builder { +public: + /// Create a builder with the given context. + explicit OpBuilder(MLIRContext *ctx) : Builder(ctx) {} + + /// Create a builder and set the insertion point to the start of the region. 
+ explicit OpBuilder(Region *region) : Builder(region->getContext()) { + if (!region->empty()) + setInsertionPoint(®ion->front(), region->front().begin()); + } + explicit OpBuilder(Region ®ion) : OpBuilder(®ion) {} + + virtual ~OpBuilder(); + + /// Create a builder and set insertion point to the given operation, which + /// will cause subsequent insertions to go right before it. + explicit OpBuilder(Operation *op) : Builder(op->getContext()) { + setInsertionPoint(op); + } + + explicit OpBuilder(Block *block) : OpBuilder(block, block->end()) {} + + OpBuilder(Block *block, Block::iterator insertPoint) + : OpBuilder(block->getParent()) { + setInsertionPoint(block, insertPoint); + } + + /// This class represents a saved insertion point. + class InsertPoint { + public: + /// Creates a new insertion point which doesn't point to anything. + InsertPoint() = default; + + /// Creates a new insertion point at the given location. + InsertPoint(Block *insertBlock, Block::iterator insertPt) + : block(insertBlock), point(insertPt) {} + + /// Returns true if this insert point is set. + bool isSet() const { return (block != nullptr); } + + Block *getBlock() const { return block; } + Block::iterator getPoint() const { return point; } + + private: + Block *block = nullptr; + Block::iterator point; + }; + + /// RAII guard to reset the insertion point of the builder when destroyed. + class InsertionGuard { + public: + InsertionGuard(OpBuilder &builder) + : builder(builder), ip(builder.saveInsertionPoint()) {} + ~InsertionGuard() { builder.restoreInsertionPoint(ip); } + + private: + OpBuilder &builder; + OpBuilder::InsertPoint ip; + }; + + /// Reset the insertion point to no location. Creating an operation without a + /// set insertion point is an error, but this can still be useful when the + /// current insertion point a builder refers to is being removed. 
+ void clearInsertionPoint() { + this->block = nullptr; + insertPoint = Block::iterator(); + } + + /// Return a saved insertion point. + InsertPoint saveInsertionPoint() const { + return InsertPoint(getInsertionBlock(), getInsertionPoint()); + } + + /// Restore the insert point to a previously saved point. + void restoreInsertionPoint(InsertPoint ip) { + if (ip.isSet()) + setInsertionPoint(ip.getBlock(), ip.getPoint()); + else + clearInsertionPoint(); + } + + /// Set the insertion point to the specified location. + void setInsertionPoint(Block *block, Block::iterator insertPoint) { + // TODO: check that insertPoint is in this rather than some other block. + this->block = block; + this->insertPoint = insertPoint; + } + + /// Sets the insertion point to the specified operation, which will cause + /// subsequent insertions to go right before it. + void setInsertionPoint(Operation *op) { + setInsertionPoint(op->getBlock(), Block::iterator(op)); + } + + /// Sets the insertion point to the node after the specified operation, which + /// will cause subsequent insertions to go right after it. + void setInsertionPointAfter(Operation *op) { + setInsertionPoint(op->getBlock(), ++Block::iterator(op)); + } + + /// Sets the insertion point to the start of the specified block. + void setInsertionPointToStart(Block *block) { + setInsertionPoint(block, block->begin()); + } + + /// Sets the insertion point to the end of the specified block. + void setInsertionPointToEnd(Block *block) { + setInsertionPoint(block, block->end()); + } + + /// Return the block the current insertion point belongs to. Note that the + /// the insertion point is not necessarily the end of the block. + Block *getInsertionBlock() const { return block; } + + /// Returns the current insertion point of the builder. + Block::iterator getInsertionPoint() const { return insertPoint; } + + /// Insert the given operation at the current insertion point and return it. 
+ virtual Operation *insert(Operation *op); + + /// Add new block and set the insertion point to the end of it. The block is + /// inserted at the provided insertion point of 'parent'. + Block *createBlock(Region *parent, Region::iterator insertPt = {}); + + /// Add new block and set the insertion point to the end of it. The block is + /// placed before 'insertBefore'. + Block *createBlock(Block *insertBefore); + + /// Returns the current block of the builder. + Block *getBlock() const { return block; } + + /// Creates an operation given the fields represented as an OperationState. + Operation *createOperation(const OperationState &state); + + /// Create an operation of specific op type at the current insertion point. + template + OpTy create(Location location, Args &&... args) { + OperationState state(location, OpTy::getOperationName()); + OpTy::build(this, state, std::forward(args)...); + auto *op = createOperation(state); + auto result = dyn_cast(op); + assert(result && "Builder didn't return the right type"); + return result; + } + + /// Create an operation of specific op type at the current insertion point, + /// and immediately try to fold it. This functions populates 'results' with + /// the results after folding the operation. + template + void createOrFold(SmallVectorImpl &results, Location location, + Args &&... args) { + // Create the operation without using 'createOperation' as we don't want to + // insert it yet. + OperationState state(location, OpTy::getOperationName()); + OpTy::build(this, state, std::forward(args)...); + Operation *op = Operation::create(state); + + // Fold the operation. If successful destroy it, otherwise insert it. + if (succeeded(tryFold(op, results))) + op->destroy(); + else + insert(op); + } + + /// Overload to create or fold a single result operation. + template + typename std::enable_if(), + Value>::type + createOrFold(Location location, Args &&... 
args) { + SmallVector results; + createOrFold(results, location, std::forward(args)...); + return results.front(); + } + + /// Overload to create or fold a zero result operation. + template + typename std::enable_if(), + OpTy>::type + createOrFold(Location location, Args &&... args) { + auto op = create(location, std::forward(args)...); + SmallVector unused; + tryFold(op.getOperation(), unused); + + // Folding cannot remove a zero-result operation, so for convenience we + // continue to return it. + return op; + } + + /// Attempts to fold the given operation and places new results within + /// 'results'. Returns success if the operation was folded, failure otherwise. + /// Note: This function does not erase the operation on a successful fold. + LogicalResult tryFold(Operation *op, SmallVectorImpl &results); + + /// Creates a deep copy of the specified operation, remapping any operands + /// that use values outside of the operation using the map that is provided + /// ( leaving them alone if no entry is present). Replaces references to + /// cloned sub-operations to the corresponding operation that is copied, + /// and adds those mappings to the map. + Operation *clone(Operation &op, BlockAndValueMapping &mapper) { + return insert(op.clone(mapper)); + } + Operation *clone(Operation &op) { return insert(op.clone()); } + + /// Creates a deep copy of this operation but keep the operation regions + /// empty. Operands are remapped using `mapper` (if present), and `mapper` is + /// updated to contain the results. 
+ Operation *cloneWithoutRegions(Operation &op, BlockAndValueMapping &mapper) { + return insert(op.cloneWithoutRegions(mapper)); + } + Operation *cloneWithoutRegions(Operation &op) { + return insert(op.cloneWithoutRegions()); + } + template OpT cloneWithoutRegions(OpT op) { + return cast(cloneWithoutRegions(*op.getOperation())); + } + +private: + Block *block = nullptr; + Block::iterator insertPoint; +}; + +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/CMakeLists.txt b/mlir/include/mlir/IR/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..555b16fd29d0386fbfde1187a4f229c04fea2a6d --- /dev/null +++ b/mlir/include/mlir/IR/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS OpAsmInterface.td) +mlir_tablegen(OpAsmInterface.h.inc -gen-op-interface-decls) +mlir_tablegen(OpAsmInterface.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(MLIROpAsmInterfacesIncGen) diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h new file mode 100644 index 0000000000000000000000000000000000000000..e3d0f8382083332745ab6aa54caa84e904922448 --- /dev/null +++ b/mlir/include/mlir/IR/Diagnostics.h @@ -0,0 +1,649 @@ +//===- Diagnostics.h - MLIR Diagnostics -------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines utilities for emitting diagnostics. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_DIAGNOSTICS_H +#define MLIR_IR_DIAGNOSTICS_H + +#include "mlir/IR/Location.h" +#include "mlir/Support/STLExtras.h" +#include + +namespace llvm { +class MemoryBuffer; +class SMLoc; +class SourceMgr; +} // end namespace llvm + +namespace mlir { +class DiagnosticEngine; +class Identifier; +struct LogicalResult; +class MLIRContext; +class Operation; +class OperationName; +class Type; + +namespace detail { +struct DiagnosticEngineImpl; +} // end namespace detail + +/// Defines the different supported severity of a diagnostic. +enum class DiagnosticSeverity { + Note, + Warning, + Error, + Remark, +}; + +//===----------------------------------------------------------------------===// +// DiagnosticArgument +//===----------------------------------------------------------------------===// + +/// A variant type that holds a single argument for a diagnostic. +class DiagnosticArgument { +public: + /// Enum that represents the different kinds of diagnostic arguments + /// supported. + enum class DiagnosticArgumentKind { + Attribute, + Double, + Integer, + Operation, + String, + Type, + Unsigned, + }; + + /// Outputs this argument to a stream. + void print(raw_ostream &os) const; + + /// Returns the kind of this argument. + DiagnosticArgumentKind getKind() const { return kind; } + + /// Returns this argument as an Attribute. + Attribute getAsAttribute() const; + + /// Returns this argument as a double. + double getAsDouble() const { + assert(getKind() == DiagnosticArgumentKind::Double); + return doubleVal; + } + + /// Returns this argument as a signed integer. + int64_t getAsInteger() const { + assert(getKind() == DiagnosticArgumentKind::Integer); + return static_cast(opaqueVal); + } + + /// Returns this argument as an operation. 
+ Operation &getAsOperation() const { + assert(getKind() == DiagnosticArgumentKind::Operation); + return *reinterpret_cast(opaqueVal); + } + + /// Returns this argument as a string. + StringRef getAsString() const { + assert(getKind() == DiagnosticArgumentKind::String); + return stringVal; + } + + /// Returns this argument as a Type. + Type getAsType() const; + + /// Returns this argument as an unsigned integer. + uint64_t getAsUnsigned() const { + assert(getKind() == DiagnosticArgumentKind::Unsigned); + return static_cast(opaqueVal); + } + +private: + friend class Diagnostic; + + // Construct from an Attribute. + explicit DiagnosticArgument(Attribute attr); + + // Construct from a floating point number. + explicit DiagnosticArgument(double val) + : kind(DiagnosticArgumentKind::Double), doubleVal(val) {} + explicit DiagnosticArgument(float val) : DiagnosticArgument(double(val)) {} + + // Construct from a signed integer. + template + explicit DiagnosticArgument( + T val, typename std::enable_if::value && + std::numeric_limits::is_integer && + sizeof(T) <= sizeof(int64_t)>::type * = 0) + : kind(DiagnosticArgumentKind::Integer), opaqueVal(int64_t(val)) {} + + // Construct from an unsigned integer. + template + explicit DiagnosticArgument( + T val, typename std::enable_if::value && + std::numeric_limits::is_integer && + sizeof(T) <= sizeof(uint64_t)>::type * = 0) + : kind(DiagnosticArgumentKind::Unsigned), opaqueVal(uint64_t(val)) {} + + // Construct from an operation reference. + explicit DiagnosticArgument(Operation &val) : DiagnosticArgument(&val) {} + explicit DiagnosticArgument(Operation *val) + : kind(DiagnosticArgumentKind::Operation), + opaqueVal(reinterpret_cast(val)) { + assert(val && "expected valid operation"); + } + + // Construct from a string reference. + explicit DiagnosticArgument(StringRef val) + : kind(DiagnosticArgumentKind::String), stringVal(val) {} + + // Construct from a Type. 
+ explicit DiagnosticArgument(Type val); + + /// The kind of this argument. + DiagnosticArgumentKind kind; + + /// The value of this argument. + union { + double doubleVal; + intptr_t opaqueVal; + StringRef stringVal; + }; +}; + +inline raw_ostream &operator<<(raw_ostream &os, const DiagnosticArgument &arg) { + arg.print(os); + return os; +} + +//===----------------------------------------------------------------------===// +// Diagnostic +//===----------------------------------------------------------------------===// + +/// This class contains all of the information necessary to report a diagnostic +/// to the DiagnosticEngine. It should generally not be constructed directly, +/// and instead used transitively via InFlightDiagnostic. +class Diagnostic { + using NoteVector = std::vector>; + + /// This class implements a wrapper iterator around NoteVector::iterator to + /// implicitly dereference the unique_ptr. + template + class NoteIteratorImpl + : public llvm::mapped_iterator { + static ResultTy &unwrap(NotePtrTy note) { return *note; } + + public: + NoteIteratorImpl(IteratorTy it) + : llvm::mapped_iterator(it, + &unwrap) {} + }; + +public: + Diagnostic(Location loc, DiagnosticSeverity severity) + : loc(loc), severity(severity) {} + Diagnostic(Diagnostic &&) = default; + Diagnostic &operator=(Diagnostic &&) = default; + + /// Returns the severity of this diagnostic. + DiagnosticSeverity getSeverity() const { return severity; } + + /// Returns the source location for this diagnostic. + Location getLocation() const { return loc; } + + /// Returns the current list of diagnostic arguments. + MutableArrayRef getArguments() { return arguments; } + ArrayRef getArguments() const { return arguments; } + + /// Stream operator for inserting new diagnostic arguments. 
+ template + typename std::enable_if::value, + Diagnostic &>::type + operator<<(Arg &&val) { + arguments.push_back(DiagnosticArgument(std::forward(val))); + return *this; + } + + /// Stream in a string literal. + Diagnostic &operator<<(const char *val) { + arguments.push_back(DiagnosticArgument(val)); + return *this; + } + + /// Stream in a Twine argument. + Diagnostic &operator<<(char val); + Diagnostic &operator<<(const Twine &val); + Diagnostic &operator<<(Twine &&val); + + /// Stream in an Identifier. + Diagnostic &operator<<(Identifier val); + + /// Stream in an OperationName. + Diagnostic &operator<<(OperationName val); + + /// Stream in a range. + template Diagnostic &operator<<(iterator_range range) { + return appendRange(range); + } + template Diagnostic &operator<<(ArrayRef range) { + return appendRange(range); + } + + /// Append a range to the diagnostic. The default delimiter between elements + /// is ','. + template class Container> + Diagnostic &appendRange(const Container &c, const char *delim = ", ") { + interleave( + c, [&](const detail::ValueOfRange> &a) { *this << a; }, + [&]() { *this << delim; }); + return *this; + } + + /// Append arguments to the diagnostic. + template + Diagnostic &append(Arg1 &&arg1, Arg2 &&arg2, Args &&... args) { + append(std::forward(arg1)); + return append(std::forward(arg2), std::forward(args)...); + } + /// Append one argument to the diagnostic. + template Diagnostic &append(Arg &&arg) { + *this << std::forward(arg); + return *this; + } + + /// Outputs this diagnostic to a stream. + void print(raw_ostream &os) const; + + /// Converts the diagnostic to a string. + std::string str() const; + + /// Attaches a note to this diagnostic. A new location may be optionally + /// provided, if not, then the location defaults to the one specified for this + /// diagnostic. Notes may not be attached to other notes. 
+ Diagnostic &attachNote(Optional noteLoc = llvm::None); + + using note_iterator = NoteIteratorImpl; + using const_note_iterator = NoteIteratorImpl; + + /// Returns the notes held by this diagnostic. + iterator_range getNotes() { + return {notes.begin(), notes.end()}; + } + iterator_range getNotes() const { + return {notes.begin(), notes.end()}; + } + + /// Allow a diagnostic to be converted to 'failure'. + operator LogicalResult() const; + +private: + Diagnostic(const Diagnostic &rhs) = delete; + Diagnostic &operator=(const Diagnostic &rhs) = delete; + + /// The source location. + Location loc; + + /// The severity of this diagnostic. + DiagnosticSeverity severity; + + /// The current list of arguments. + SmallVector arguments; + + /// A list of string values used as arguments. This is used to guarantee the + /// liveness of non-constant strings used in diagnostics. + std::vector> strings; + + /// A list of attached notes. + NoteVector notes; +}; + +inline raw_ostream &operator<<(raw_ostream &os, const Diagnostic &diag) { + diag.print(os); + return os; +} + +//===----------------------------------------------------------------------===// +// InFlightDiagnostic +//===----------------------------------------------------------------------===// + +/// This class represents a diagnostic that is inflight and set to be reported. +/// This allows for last minute modifications of the diagnostic before it is +/// emitted by a DiagnosticEngine. +class InFlightDiagnostic { +public: + InFlightDiagnostic() = default; + InFlightDiagnostic(InFlightDiagnostic &&rhs) + : owner(rhs.owner), impl(std::move(rhs.impl)) { + // Reset the rhs diagnostic. + rhs.impl.reset(); + rhs.abandon(); + } + ~InFlightDiagnostic() { + if (isInFlight()) + report(); + } + + /// Stream operator for new diagnostic arguments. 
+ template InFlightDiagnostic &operator<<(Arg &&arg) & { + return append(std::forward(arg)); + } + template InFlightDiagnostic &&operator<<(Arg &&arg) && { + return std::move(append(std::forward(arg))); + } + + /// Append arguments to the diagnostic. + template InFlightDiagnostic &append(Args &&... args) & { + assert(isActive() && "diagnostic not active"); + if (isInFlight()) + impl->append(std::forward(args)...); + return *this; + } + template InFlightDiagnostic &&append(Args &&... args) && { + return std::move(append(std::forward(args)...)); + } + + /// Attaches a note to this diagnostic. + Diagnostic &attachNote(Optional noteLoc = llvm::None) { + assert(isActive() && "diagnostic not active"); + return impl->attachNote(noteLoc); + } + + /// Reports the diagnostic to the engine. + void report(); + + /// Abandons this diagnostic so that it will no longer be reported. + void abandon(); + + /// Allow an inflight diagnostic to be converted to 'failure', otherwise + /// 'success' if this is an empty diagnostic. + operator LogicalResult() const; + +private: + InFlightDiagnostic &operator=(const InFlightDiagnostic &) = delete; + InFlightDiagnostic &operator=(InFlightDiagnostic &&) = delete; + InFlightDiagnostic(DiagnosticEngine *owner, Diagnostic &&rhs) + : owner(owner), impl(std::move(rhs)) {} + + /// Returns if the diagnostic is still active, i.e. it has a live diagnostic. + bool isActive() const { return impl.hasValue(); } + + /// Returns if the diagnostic is still in flight to be reported. + bool isInFlight() const { return owner; } + + // Allow access to the constructor. + friend DiagnosticEngine; + + /// The engine that this diagnostic is to report to. + DiagnosticEngine *owner = nullptr; + + /// The raw diagnostic that is inflight to be reported. 
+ Optional impl; +}; + +//===----------------------------------------------------------------------===// +// DiagnosticEngine +//===----------------------------------------------------------------------===// + +/// This class is the main interface for diagnostics. The DiagnosticEngine +/// manages the registration of diagnostic handlers as well as the core API for +/// diagnostic emission. This class should not be constructed directly, but +/// instead interfaced with via an MLIRContext instance. +class DiagnosticEngine { +public: + ~DiagnosticEngine(); + + // Diagnostic handler registration and use. MLIR supports the ability for the + // IR to carry arbitrary metadata about operation location information. If a + // problem is detected by the compiler, it can invoke the emitError / + // emitWarning / emitRemark method on an Operation and have it get reported + // through this interface. + // + // Tools using MLIR are encouraged to register error handlers and define a + // schema for their location information. If they don't, then warnings and + // notes will be dropped and errors will be emitted to errs. + + /// The handler type for MLIR diagnostics. This function takes a diagnostic as + /// input, and returns success if the handler has fully processed this + /// diagnostic. Returns failure otherwise. + using HandlerTy = std::function; + + /// A handle to a specific registered handler object. + using HandlerID = uint64_t; + + /// Register a new handler for diagnostics to the engine. Diagnostics are + /// process by handlers in stack-like order, meaning that the last added + /// handlers will process diagnostics first. This function returns a unique + /// identifier for the registered handler, which can be used to unregister + /// this handler at a later time. + HandlerID registerHandler(const HandlerTy &handler); + + /// Set the diagnostic handler with a function that returns void. 
This is a + /// convenient wrapper for handlers that always completely process the given + /// diagnostic. + template ()( + std::declval()))> + std::enable_if_t::value, HandlerID> + registerHandler(FuncTy &&handler) { + return registerHandler([=](Diagnostic &diag) { + handler(diag); + return success(); + }); + } + + /// Erase the registered diagnostic handler with the given identifier. + void eraseHandler(HandlerID id); + + /// Create a new inflight diagnostic with the given location and severity. + InFlightDiagnostic emit(Location loc, DiagnosticSeverity severity) { + assert(severity != DiagnosticSeverity::Note && + "notes should not be emitted directly"); + return InFlightDiagnostic(this, Diagnostic(loc, severity)); + } + + /// Emit a diagnostic using the registered issue handler if present, or with + /// the default behavior if not. + void emit(Diagnostic diag); + +private: + friend class MLIRContextImpl; + DiagnosticEngine(); + + /// The internal implementation of the DiagnosticEngine. + std::unique_ptr impl; +}; + +/// Utility method to emit an error message using this location. +InFlightDiagnostic emitError(Location loc); +InFlightDiagnostic emitError(Location loc, const Twine &message); + +/// Utility method to emit a warning message using this location. +InFlightDiagnostic emitWarning(Location loc); +InFlightDiagnostic emitWarning(Location loc, const Twine &message); + +/// Utility method to emit a remark message using this location. +InFlightDiagnostic emitRemark(Location loc); +InFlightDiagnostic emitRemark(Location loc, const Twine &message); + +/// Overloads of the above emission functions that take an optionally null +/// location. If the location is null, no diagnostic is emitted and a failure is +/// returned. Given that the provided location may be null, these methods take +/// the diagnostic arguments directly instead of relying on the returned +/// InFlightDiagnostic. +template +LogicalResult emitOptionalError(Optional loc, Args &&... 
args) { + if (loc) + return emitError(*loc).append(std::forward(args)...); + return failure(); +} +template +LogicalResult emitOptionalWarning(Optional loc, Args &&... args) { + if (loc) + return emitWarning(*loc).append(std::forward(args)...); + return failure(); +} +template +LogicalResult emitOptionalRemark(Optional loc, Args &&... args) { + if (loc) + return emitRemark(*loc).append(std::forward(args)...); + return failure(); +} + +//===----------------------------------------------------------------------===// +// ScopedDiagnosticHandler +//===----------------------------------------------------------------------===// + +/// This diagnostic handler is a simple RAII class that registers and erases a +/// diagnostic handler on a given context. This class can be either be used +/// directly, or in conjunction with a derived diagnostic handler. +class ScopedDiagnosticHandler { +public: + explicit ScopedDiagnosticHandler(MLIRContext *ctx) : handlerID(0), ctx(ctx) {} + template + ScopedDiagnosticHandler(MLIRContext *ctx, FuncTy &&handler) + : handlerID(0), ctx(ctx) { + setHandler(std::forward(handler)); + } + ~ScopedDiagnosticHandler(); + +protected: + /// Set the handler to manage via RAII. + template void setHandler(FuncTy &&handler) { + auto &diagEngine = ctx->getDiagEngine(); + if (handlerID) + diagEngine.eraseHandler(handlerID); + handlerID = diagEngine.registerHandler(std::forward(handler)); + } + +private: + /// The unique id for the scoped handler. + DiagnosticEngine::HandlerID handlerID; + + /// The context to erase the handler from. + MLIRContext *ctx; +}; + +//===----------------------------------------------------------------------===// +// SourceMgrDiagnosticHandler +//===----------------------------------------------------------------------===// + +namespace detail { +struct SourceMgrDiagnosticHandlerImpl; +} // end namespace detail + +/// This class is a utility diagnostic handler for use with llvm::SourceMgr. 
+class SourceMgrDiagnosticHandler : public ScopedDiagnosticHandler { +public: + SourceMgrDiagnosticHandler(llvm::SourceMgr &mgr, MLIRContext *ctx, + raw_ostream &os); + SourceMgrDiagnosticHandler(llvm::SourceMgr &mgr, MLIRContext *ctx); + ~SourceMgrDiagnosticHandler(); + + /// Emit the given diagnostic information with the held source manager. + void emitDiagnostic(Location loc, Twine message, DiagnosticSeverity kind); + +protected: + /// Emit the given diagnostic with the held source manager. + void emitDiagnostic(Diagnostic &diag); + + /// Get a memory buffer for the given file, or nullptr if no file is + /// available. + const llvm::MemoryBuffer *getBufferForFile(StringRef filename); + + /// The source manager that we are wrapping. + llvm::SourceMgr &mgr; + + /// The output stream to use when printing diagnostics. + raw_ostream &os; + +private: + /// Convert a location into the given memory buffer into an SMLoc. + llvm::SMLoc convertLocToSMLoc(FileLineColLoc loc); + + /// The maximum depth that a call stack will be printed. + /// TODO(riverriddle) This should be a tunable flag. + unsigned callStackLimit = 10; + + std::unique_ptr impl; +}; + +//===----------------------------------------------------------------------===// +// SourceMgrDiagnosticVerifierHandler +//===----------------------------------------------------------------------===// + +namespace detail { +struct SourceMgrDiagnosticVerifierHandlerImpl; +} // end namespace detail + +/// This class is a utility diagnostic handler for use with llvm::SourceMgr that +/// verifies that emitted diagnostics match 'expected-*' lines on the +/// corresponding line of the source file. 
+class SourceMgrDiagnosticVerifierHandler : public SourceMgrDiagnosticHandler { +public: + SourceMgrDiagnosticVerifierHandler(llvm::SourceMgr &srcMgr, MLIRContext *ctx, + raw_ostream &out); + SourceMgrDiagnosticVerifierHandler(llvm::SourceMgr &srcMgr, MLIRContext *ctx); + ~SourceMgrDiagnosticVerifierHandler(); + + /// Returns the status of the handler and verifies that all expected + /// diagnostics were emitted. This return success if all diagnostics were + /// verified correctly, failure otherwise. + LogicalResult verify(); + +private: + /// Process a single diagnostic. + void process(Diagnostic &diag); + + /// Process a FileLineColLoc diagnostic. + void process(FileLineColLoc loc, StringRef msg, DiagnosticSeverity kind); + + std::unique_ptr impl; +}; + +//===----------------------------------------------------------------------===// +// ParallelDiagnosticHandler +//===----------------------------------------------------------------------===// + +namespace detail { +struct ParallelDiagnosticHandlerImpl; +} // end namespace detail + +/// This class is a utility diagnostic handler for use when multi-threading some +/// part of the compiler where diagnostics may be emitted. This handler ensures +/// a deterministic ordering to the emitted diagnostics that mirrors that of a +/// single-threaded compilation. +class ParallelDiagnosticHandler { +public: + ParallelDiagnosticHandler(MLIRContext *ctx); + ~ParallelDiagnosticHandler(); + + /// Set the order id for the current thread. This is required to be set by + /// each thread that will be emitting diagnostics to this handler. The orderID + /// corresponds to the order in which diagnostics would be emitted when + /// executing synchronously. For example, if we were processing a list + /// of operations [a, b, c] on a single-thread. Diagnostics emitted while + /// processing operation 'a' would be emitted before those for 'b' or 'c'. + /// This corresponds 1-1 with the 'orderID'. 
The thread that is processing 'a' + /// should set the orderID to '0'; the thread processing 'b' should set it to + /// '1'; and so on and so forth. This provides a way for the handler to + /// deterministically order the diagnostics that it receives given the thread + /// that it is receiving on. + void setOrderIDForThread(size_t orderID); + + /// Remove the order id for the current thread. This removes the thread from + /// diagnostics tracking. + void eraseOrderIDForThread(); + +private: + std::unique_ptr impl; +}; +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h new file mode 100644 index 0000000000000000000000000000000000000000..d3b4b055bc0c96ba221432e26b787f98f04fe164 --- /dev/null +++ b/mlir/include/mlir/IR/Dialect.h @@ -0,0 +1,315 @@ +//===- Dialect.h - IR Dialect Description -----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the 'dialect' abstraction. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_DIALECT_H +#define MLIR_IR_DIALECT_H + +#include "mlir/IR/OperationSupport.h" + +namespace mlir { +class DialectAsmParser; +class DialectAsmPrinter; +class DialectInterface; +class OpBuilder; +class Type; + +using DialectConstantDecodeHook = + std::function; +using DialectConstantFoldHook = std::function, SmallVectorImpl &)>; +using DialectExtractElementHook = + std::function)>; + +/// Dialects are groups of MLIR operations and behavior associated with the +/// entire group. For example, hooks into other systems for constant folding, +/// default named types for asm printing, etc. 
+/// +/// Instances of the dialect object are global across all MLIRContext's that may +/// be active in the process. +/// +class Dialect { +public: + virtual ~Dialect(); + + /// Utility function that returns if the given string is a valid dialect + /// namespace. + static bool isValidNamespace(StringRef str); + + MLIRContext *getContext() const { return context; } + + StringRef getNamespace() const { return name; } + + /// Returns true if this dialect allows for unregistered operations, i.e. + /// operations prefixed with the dialect namespace but not registered with + /// addOperation. + bool allowsUnknownOperations() const { return unknownOpsAllowed; } + + /// Return true if this dialect allows for unregistered types, i.e., types + /// prefixed with the dialect namespace but not registered with addType. + /// These are represented with OpaqueType. + bool allowsUnknownTypes() const { return unknownTypesAllowed; } + + //===--------------------------------------------------------------------===// + // Constant Hooks + //===--------------------------------------------------------------------===// + + /// Registered fallback constant fold hook for the dialect. Like the constant + /// fold hook of each operation, it attempts to constant fold the operation + /// with the specified constant operand values - the elements in "operands" + /// will correspond directly to the operands of the operation, but may be null + /// if non-constant. If constant folding is successful, this fills in the + /// `results` vector. If not, this returns failure and `results` is + /// unspecified. + DialectConstantFoldHook constantFoldHook = + [](Operation *op, ArrayRef operands, + SmallVectorImpl &results) { return failure(); }; + + /// Registered hook to decode opaque constants associated with this + /// dialect. The hook function attempts to decode an opaque constant tensor + /// into a tensor with non-opaque content. 
If decoding is successful, this + /// method returns false and sets 'output' attribute. If not, it returns true + /// and leaves 'output' unspecified. The default hook fails to decode. + DialectConstantDecodeHook decodeHook = + [](const OpaqueElementsAttr input, ElementsAttr &output) { return true; }; + + /// Registered hook to extract an element from an opaque constant associated + /// with this dialect. If element has been successfully extracted, this + /// method returns that element. If not, it returns an empty attribute. + /// The default hook fails to extract an element. + DialectExtractElementHook extractElementHook = + [](const OpaqueElementsAttr input, ArrayRef index) { + return Attribute(); + }; + + /// Registered hook to materialize a single constant operation from a given + /// attribute value with the desired resultant type. This method should use + /// the provided builder to create the operation without changing the + /// insertion position. The generated operation is expected to be constant + /// like, i.e. single result, zero operands, non side-effecting, etc. On + /// success, this hook should return the value generated to represent the + /// constant value. Otherwise, it should return null on failure. + virtual Operation *materializeConstant(OpBuilder &builder, Attribute value, + Type type, Location loc) { + return nullptr; + } + + //===--------------------------------------------------------------------===// + // Parsing Hooks + //===--------------------------------------------------------------------===// + + /// Parse an attribute registered to this dialect. If 'type' is nonnull, it + /// refers to the expected type of the attribute. + virtual Attribute parseAttribute(DialectAsmParser &parser, Type type) const; + + /// Print an attribute registered to this dialect. Note: The type of the + /// attribute need not be printed by this method as it is always printed by + /// the caller. 
+ virtual void printAttribute(Attribute, DialectAsmPrinter &) const { + llvm_unreachable("dialect has no registered attribute printing hook"); + } + + /// Parse a type registered to this dialect. + virtual Type parseType(DialectAsmParser &parser) const; + + /// Print a type registered to this dialect. + virtual void printType(Type, DialectAsmPrinter &) const { + llvm_unreachable("dialect has no registered type printing hook"); + } + + //===--------------------------------------------------------------------===// + // Verification Hooks + //===--------------------------------------------------------------------===// + + /// Verify an attribute from this dialect on the argument at 'argIndex' for + /// the region at 'regionIndex' on the given operation. Returns failure if + /// the verification failed, success otherwise. This hook may optionally be + /// invoked from any operation containing a region. + virtual LogicalResult verifyRegionArgAttribute(Operation *, + unsigned regionIndex, + unsigned argIndex, + NamedAttribute); + + /// Verify an attribute from this dialect on the result at 'resultIndex' for + /// the region at 'regionIndex' on the given operation. Returns failure if + /// the verification failed, success otherwise. This hook may optionally be + /// invoked from any operation containing a region. + virtual LogicalResult verifyRegionResultAttribute(Operation *, + unsigned regionIndex, + unsigned resultIndex, + NamedAttribute); + + /// Verify an attribute from this dialect on the given operation. Returns + /// failure if the verification failed, success otherwise. + virtual LogicalResult verifyOperationAttribute(Operation *, NamedAttribute) { + return success(); + } + + //===--------------------------------------------------------------------===// + // Interfaces + //===--------------------------------------------------------------------===// + + /// Lookup an interface for the given ID if one is registered, otherwise + /// nullptr. 
+ const DialectInterface *getRegisteredInterface(ClassID *interfaceID) { + auto it = registeredInterfaces.find(interfaceID); + return it != registeredInterfaces.end() ? it->getSecond().get() : nullptr; + } + template const InterfaceT *getRegisteredInterface() { + return static_cast( + getRegisteredInterface(InterfaceT::getInterfaceID())); + } + +protected: + /// The constructor takes a unique namespace for this dialect as well as the + /// context to bind to. + /// Note: The namespace must not contain '.' characters. + /// Note: All operations belonging to this dialect must have names starting + /// with the namespace followed by '.'. + /// Example: + /// - "tf" for the TensorFlow ops like "tf.add". + Dialect(StringRef name, MLIRContext *context); + + /// This method is used by derived classes to add their operations to the set. + /// + template void addOperations() { + VariadicOperationAdder::addToSet(*this); + } + + // It would be nice to define this as variadic functions instead of a nested + // variadic type, but we can't do that: function template partial + // specialization is not allowed, and we can't define an overload set because + // we don't have any arguments of the types we are pushing around. + template class VariadicOperationAdder { + public: + static void addToSet(Dialect &dialect) { + dialect.addOperation(AbstractOperation::get(dialect)); + VariadicOperationAdder::addToSet(dialect); + } + }; + + template class VariadicOperationAdder { + public: + static void addToSet(Dialect &dialect) { + dialect.addOperation(AbstractOperation::get(dialect)); + } + }; + + void addOperation(AbstractOperation opInfo); + + /// This method is used by derived classes to add their types to the set. + template void addTypes() { + VariadicSymbolAdder::addToSet(*this); + } + + /// This method is used by derived classes to add their attributes to the set. 
+ template void addAttributes() { + VariadicSymbolAdder::addToSet(*this); + } + + // It would be nice to define this as variadic functions instead of a nested + // variadic type, but we can't do that: function template partial + // specialization is not allowed, and we can't define an overload set + // because we don't have any arguments of the types we are pushing around. + template struct VariadicSymbolAdder { + static void addToSet(Dialect &dialect) { + VariadicSymbolAdder::addToSet(dialect); + VariadicSymbolAdder::addToSet(dialect); + } + }; + + template struct VariadicSymbolAdder { + static void addToSet(Dialect &dialect) { + dialect.addSymbol(First::getClassID()); + } + }; + + /// Enable support for unregistered operations. + void allowUnknownOperations(bool allow = true) { unknownOpsAllowed = allow; } + + /// Enable support for unregistered types. + void allowUnknownTypes(bool allow = true) { unknownTypesAllowed = allow; } + + /// Register a dialect interface with this dialect instance. + void addInterface(std::unique_ptr interface); + + /// Register a set of dialect interfaces with this dialect instance. + template void addInterfaces() { + addInterfaces(); + addInterfaces(); + } + template void addInterfaces() { + addInterface(std::make_unique(this)); + } + +private: + // Register a symbol(e.g. type) with its given unique class identifier. + void addSymbol(const ClassID *const classID); + + Dialect(const Dialect &) = delete; + void operator=(Dialect &) = delete; + + /// Register this dialect object with the specified context. The context + /// takes ownership of the heap allocated dialect. + void registerDialect(MLIRContext *context); + + /// The namespace of this dialect. + StringRef name; + + /// This is the context that owns this Dialect object. + MLIRContext *context; + + /// Flag that specifies whether this dialect supports unregistered operations, + /// i.e. operations prefixed with the dialect namespace but not registered + /// with addOperation. 
+ bool unknownOpsAllowed = false; + + /// Flag that specifies whether this dialect allows unregistered types, i.e. + /// types prefixed with the dialect namespace but not registered with addType. + /// These types are represented with OpaqueType. + bool unknownTypesAllowed = false; + + /// A collection of registered dialect interfaces. + DenseMap> registeredInterfaces; +}; + +using DialectAllocatorFunction = std::function; + +/// Registers a specific dialect creation function with the system, typically +/// used through the DialectRegistration template. +void registerDialectAllocator(const DialectAllocatorFunction &function); + +/// Registers all dialects with the specified MLIRContext. +void registerAllDialects(MLIRContext *context); + +/// Utility to register a dialect. Client can register their dialect with the +/// global registry by calling registerDialect(); +template void registerDialect() { + registerDialectAllocator([](MLIRContext *ctx) { + // Just allocate the dialect, the context takes ownership of it. + new ConcreteDialect(ctx); + }); +} + +/// DialectRegistration provides a global initializer that registers a Dialect +/// allocation routine. +/// +/// Usage: +/// +/// // At namespace scope. +/// static DialectRegistration Unused; +template struct DialectRegistration { + DialectRegistration() { registerDialect(); } +}; + +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/DialectHooks.h b/mlir/include/mlir/IR/DialectHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..7e4e1d8335b1645725e416085d5c7d4dac302620 --- /dev/null +++ b/mlir/include/mlir/IR/DialectHooks.h @@ -0,0 +1,73 @@ +//===- DialectHooks.h - MLIR DialectHooks mechanism -------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines abstraction and registration mechanism for dialect hooks. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_DIALECT_HOOKS_H +#define MLIR_IR_DIALECT_HOOKS_H + +#include "mlir/IR/Dialect.h" +#include "llvm/Support/raw_ostream.h" + +namespace mlir { +using DialectHooksSetter = std::function; + +/// Dialect hooks allow external components to register their functions to +/// be called for specific tasks specialized per dialect, such as decoding +/// of opaque constants. To register concrete dialect hooks, one should +/// define a DialectHooks subclass and use it as a template +/// argument to DialectHooksRegistration. For example, +/// class MyHooks : public DialectHooks {...}; +/// static DialectHooksRegistration hooksReg; +/// The subclass should override DialectHook methods for supported hooks. +class DialectHooks { +public: + // Returns hook to constant fold an operation. + DialectConstantFoldHook getConstantFoldHook() { return nullptr; } + // Returns hook to decode opaque constant tensor. + DialectConstantDecodeHook getDecodeHook() { return nullptr; } + // Returns hook to extract an element of an opaque constant tensor. + DialectExtractElementHook getExtractElementHook() { return nullptr; } +}; + +/// Registers a function that will set hooks in the registered dialects +/// based on information coming from DialectHooksRegistration. +void registerDialectHooksSetter(const DialectHooksSetter &function); + +/// DialectHooksRegistration provides a global initializer that registers +/// a dialect hooks setter routine. +/// Usage: +/// +/// // At namespace scope. 
+/// static DialectHooksRegistration unused; +template struct DialectHooksRegistration { + DialectHooksRegistration(StringRef dialectName) { + registerDialectHooksSetter([dialectName](MLIRContext *ctx) { + Dialect *dialect = ctx->getRegisteredDialect(dialectName); + if (!dialect) { + llvm::errs() << "error: cannot register hooks for unknown dialect '" + << dialectName << "'\n"; + abort(); + } + // Set hooks. + ConcreteHooks hooks; + if (auto h = hooks.getConstantFoldHook()) + dialect->constantFoldHook = h; + if (auto h = hooks.getDecodeHook()) + dialect->decodeHook = h; + if (auto h = hooks.getExtractElementHook()) + dialect->extractElementHook = h; + }); + } +}; + +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/DialectImplementation.h b/mlir/include/mlir/IR/DialectImplementation.h new file mode 100644 index 0000000000000000000000000000000000000000..1eada8f264b14c661fea272e5a32f9468682fed9 --- /dev/null +++ b/mlir/include/mlir/IR/DialectImplementation.h @@ -0,0 +1,333 @@ +//===- DialectImplementation.h ----------------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains utilities classes for implementing dialect attributes and +// types. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_DIALECTIMPLEMENTATION_H +#define MLIR_IR_DIALECTIMPLEMENTATION_H + +#include "mlir/IR/OpImplementation.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/raw_ostream.h" + +namespace mlir { + +class Builder; + +//===----------------------------------------------------------------------===// +// DialectAsmPrinter +//===----------------------------------------------------------------------===// + +/// This is a pure-virtual base class that exposes the asmprinter hooks +/// necessary to implement a custom printAttribute/printType() method on a +/// dialect. +class DialectAsmPrinter { +public: + DialectAsmPrinter() {} + virtual ~DialectAsmPrinter(); + virtual raw_ostream &getStream() const = 0; + + /// Print the given attribute to the stream. + virtual void printAttribute(Attribute attr) = 0; + + /// Print the given floating point value in a stabilized form that can be + /// roundtripped through the IR. This is the companion to the 'parseFloat' + /// hook on the DialectAsmParser. + virtual void printFloat(const APFloat &value) = 0; + + /// Print the given type to the stream. + virtual void printType(Type type) = 0; + +private: + DialectAsmPrinter(const DialectAsmPrinter &) = delete; + void operator=(const DialectAsmPrinter &) = delete; +}; + +// Make the implementations convenient to use. 
+inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, Attribute attr) { + p.printAttribute(attr); + return p; +} + +inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, + const APFloat &value) { + p.printFloat(value); + return p; +} +inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, float value) { + return p << APFloat(value); +} +inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, double value) { + return p << APFloat(value); +} + +inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, Type type) { + p.printType(type); + return p; +} + +// Support printing anything that isn't convertible to one of the above types, +// even if it isn't exactly one of them. For example, we want to print +// FunctionType with the Type version above, not have it match this. +template ::value && + !std::is_convertible::value && + !std::is_convertible::value && + !llvm::is_one_of::value, + T>::type * = nullptr> +inline DialectAsmPrinter &operator<<(DialectAsmPrinter &p, const T &other) { + p.getStream() << other; + return p; +} + +//===----------------------------------------------------------------------===// +// DialectAsmParser +//===----------------------------------------------------------------------===// + +/// The DialectAsmParser has methods for interacting with the asm parser: +/// parsing things from it, emitting errors etc. It has an intentionally +/// high-level API that is designed to reduce/constrain syntax innovation in +/// individual attributes or types. +class DialectAsmParser { +public: + virtual ~DialectAsmParser(); + + /// Emit a diagnostic at the specified location and return failure. + virtual InFlightDiagnostic emitError(llvm::SMLoc loc, + const Twine &message = {}) = 0; + + /// Return a builder which provides useful access to MLIRContext, global + /// objects like types and attributes. + virtual Builder &getBuilder() const = 0; + + /// Get the location of the next token and store it into the argument. This + /// always succeeds. 
+ virtual llvm::SMLoc getCurrentLocation() = 0; + ParseResult getCurrentLocation(llvm::SMLoc *loc) { + *loc = getCurrentLocation(); + return success(); + } + + /// Return the location of the original name token. + virtual llvm::SMLoc getNameLoc() const = 0; + + /// Re-encode the given source location as an MLIR location and return it. + virtual Location getEncodedSourceLoc(llvm::SMLoc loc) = 0; + + /// Returns the full specification of the symbol being parsed. This allows for + /// using a separate parser if necessary. + virtual StringRef getFullSymbolSpec() const = 0; + + // These methods emit an error and return failure or success. This allows + // these to be chained together into a linear sequence of || expressions in + // many cases. + + /// Parse a floating point value from the stream. + virtual ParseResult parseFloat(double &result) = 0; + + /// Parse an integer value from the stream. + template ParseResult parseInteger(IntT &result) { + auto loc = getCurrentLocation(); + OptionalParseResult parseResult = parseOptionalInteger(result); + if (!parseResult.hasValue()) + return emitError(loc, "expected integer value"); + return *parseResult; + } + + /// Parse an optional integer value from the stream. + virtual OptionalParseResult parseOptionalInteger(uint64_t &result) = 0; + + template + OptionalParseResult parseOptionalInteger(IntT &result) { + auto loc = getCurrentLocation(); + + // Parse the unsigned variant. + uint64_t uintResult; + OptionalParseResult parseResult = parseOptionalInteger(uintResult); + if (!parseResult.hasValue() || failed(*parseResult)) + return parseResult; + + // Try to convert to the provided integer type. 
+ result = IntT(uintResult); + if (uint64_t(result) != uintResult) + return emitError(loc, "integer value too large"); + return success(); + } + + //===--------------------------------------------------------------------===// + // Token Parsing + //===--------------------------------------------------------------------===// + + /// Parse a '->' token. + virtual ParseResult parseArrow() = 0; + + /// Parse a '->' token if present + virtual ParseResult parseOptionalArrow() = 0; + + /// Parse a '{' token. + virtual ParseResult parseLBrace() = 0; + + /// Parse a '{' token if present + virtual ParseResult parseOptionalLBrace() = 0; + + /// Parse a `}` token. + virtual ParseResult parseRBrace() = 0; + + /// Parse a `}` token if present + virtual ParseResult parseOptionalRBrace() = 0; + + /// Parse a `:` token. + virtual ParseResult parseColon() = 0; + + /// Parse a `:` token if present. + virtual ParseResult parseOptionalColon() = 0; + + /// Parse a `,` token. + virtual ParseResult parseComma() = 0; + + /// Parse a `,` token if present. + virtual ParseResult parseOptionalComma() = 0; + + /// Parse a `=` token. + virtual ParseResult parseEqual() = 0; + + /// Parse a given keyword. + ParseResult parseKeyword(StringRef keyword, const Twine &msg = "") { + auto loc = getCurrentLocation(); + if (parseOptionalKeyword(keyword)) + return emitError(loc, "expected '") << keyword << "'" << msg; + return success(); + } + + /// Parse a keyword into 'keyword'. + ParseResult parseKeyword(StringRef *keyword) { + auto loc = getCurrentLocation(); + if (parseOptionalKeyword(keyword)) + return emitError(loc, "expected valid keyword"); + return success(); + } + + /// Parse the given keyword if present. + virtual ParseResult parseOptionalKeyword(StringRef keyword) = 0; + + /// Parse a keyword, if present, into 'keyword'. + virtual ParseResult parseOptionalKeyword(StringRef *keyword) = 0; + + /// Parse a '<' token. + virtual ParseResult parseLess() = 0; + + /// Parse a `<` token if present. 
+ virtual ParseResult parseOptionalLess() = 0; + + /// Parse a '>' token. + virtual ParseResult parseGreater() = 0; + + /// Parse a `>` token if present. + virtual ParseResult parseOptionalGreater() = 0; + + /// Parse a `(` token. + virtual ParseResult parseLParen() = 0; + + /// Parse a `(` token if present. + virtual ParseResult parseOptionalLParen() = 0; + + /// Parse a `)` token. + virtual ParseResult parseRParen() = 0; + + /// Parse a `)` token if present. + virtual ParseResult parseOptionalRParen() = 0; + + /// Parse a `[` token. + virtual ParseResult parseLSquare() = 0; + + /// Parse a `[` token if present. + virtual ParseResult parseOptionalLSquare() = 0; + + /// Parse a `]` token. + virtual ParseResult parseRSquare() = 0; + + /// Parse a `]` token if present. + virtual ParseResult parseOptionalRSquare() = 0; + + /// Parse a `...` token if present; + virtual ParseResult parseOptionalEllipsis() = 0; + + /// Parse a `?` token. + virtual ParseResult parseOptionalQuestion() = 0; + + /// Parse a `*` token. + virtual ParseResult parseOptionalStar() = 0; + + //===--------------------------------------------------------------------===// + // Attribute Parsing + //===--------------------------------------------------------------------===// + + /// Parse an arbitrary attribute and return it in result. + virtual ParseResult parseAttribute(Attribute &result, Type type = {}) = 0; + + /// Parse an attribute of a specific kind and type. + template + ParseResult parseAttribute(AttrType &result, Type type = {}) { + llvm::SMLoc loc = getCurrentLocation(); + + // Parse any kind of attribute. + Attribute attr; + if (parseAttribute(attr)) + return failure(); + + // Check for the right kind of attribute. 
+ result = attr.dyn_cast(); + if (!result) + return emitError(loc, "invalid kind of attribute specified"); + return success(); + } + + //===--------------------------------------------------------------------===// + // Type Parsing + //===--------------------------------------------------------------------===// + + /// Parse a type. + virtual ParseResult parseType(Type &result) = 0; + + /// Parse a type of a specific kind, e.g. a FunctionType. + template ParseResult parseType(TypeType &result) { + llvm::SMLoc loc = getCurrentLocation(); + + // Parse any kind of type. + Type type; + if (parseType(type)) + return failure(); + + // Check for the right kind of attribute. + result = type.dyn_cast(); + if (!result) + return emitError(loc, "invalid kind of type specified"); + return success(); + } + + /// Parse a 'x' separated dimension list. This populates the dimension list, + /// using -1 for the `?` dimensions if `allowDynamic` is set and errors out on + /// `?` otherwise. + /// + /// dimension-list ::= (dimension `x`)* + /// dimension ::= `?` | integer + /// + /// When `allowDynamic` is not set, this is used to parse: + /// + /// static-dimension-list ::= (integer `x`)* + virtual ParseResult parseDimensionList(SmallVectorImpl &dimensions, + bool allowDynamic = true) = 0; +}; + +} // end namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/DialectInterface.h b/mlir/include/mlir/IR/DialectInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..ff1f8fb015abda7c68e31cc04702f03b1e78b1c4 --- /dev/null +++ b/mlir/include/mlir/IR/DialectInterface.h @@ -0,0 +1,181 @@ +//===- DialectInterface.h - IR Dialect Interfaces ---------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_DIALECTINTERFACE_H +#define MLIR_IR_DIALECTINTERFACE_H + +#include "mlir/Support/STLExtras.h" +#include "llvm/ADT/DenseSet.h" + +namespace mlir { +class Dialect; +class MLIRContext; +class Operation; + +//===----------------------------------------------------------------------===// +// DialectInterface +//===----------------------------------------------------------------------===// +namespace detail { +/// The base class used for all derived interface types. This class provides +/// utilities necessary for registration. +template +class DialectInterfaceBase : public BaseT { +public: + using Base = DialectInterfaceBase; + + /// Get a unique id for the derived interface type. + static ClassID *getInterfaceID() { return ClassID::getID(); } + +protected: + DialectInterfaceBase(Dialect *dialect) : BaseT(dialect, getInterfaceID()) {} +}; +} // end namespace detail + +/// This class represents an interface overridden for a single dialect. +class DialectInterface { +public: + virtual ~DialectInterface(); + + /// The base class used for all derived interface types. This class provides + /// utilities necessary for registration. + template + using Base = detail::DialectInterfaceBase; + + /// Return the dialect that this interface represents. + Dialect *getDialect() const { return dialect; } + + /// Return the derived interface id. + ClassID *getID() const { return interfaceID; } + +protected: + DialectInterface(Dialect *dialect, ClassID *id) + : dialect(dialect), interfaceID(id) {} + +private: + /// The dialect that represents this interface. + Dialect *dialect; + + /// The unique identifier for the derived interface type. 
+ ClassID *interfaceID; +}; + +//===----------------------------------------------------------------------===// +// DialectInterfaceCollection +//===----------------------------------------------------------------------===// + +namespace detail { +/// This class is the base class for a collection of instances for a specific +/// interface kind. +class DialectInterfaceCollectionBase { + /// DenseMap info for dialect interfaces that allows lookup by the dialect. + struct InterfaceKeyInfo : public DenseMapInfo { + using DenseMapInfo::isEqual; + + static unsigned getHashValue(Dialect *key) { return llvm::hash_value(key); } + static unsigned getHashValue(const DialectInterface *key) { + return getHashValue(key->getDialect()); + } + + static bool isEqual(Dialect *lhs, const DialectInterface *rhs) { + if (rhs == getEmptyKey() || rhs == getTombstoneKey()) + return false; + return lhs == rhs->getDialect(); + } + }; + + /// A set of registered dialect interface instances. + using InterfaceSetT = DenseSet; + using InterfaceVectorT = std::vector; + +public: + DialectInterfaceCollectionBase(MLIRContext *ctx, ClassID *interfaceKind); + virtual ~DialectInterfaceCollectionBase(); + +protected: + /// Get the interface for the dialect of given operation, or null if one + /// is not registered. + const DialectInterface *getInterfaceFor(Operation *op) const; + + /// Get the interface for the given dialect. + const DialectInterface *getInterfaceFor(Dialect *dialect) const { + auto it = interfaces.find_as(dialect); + return it == interfaces.end() ? nullptr : *it; + } + + /// An iterator class that iterates the held interface objects of the given + /// derived interface type. 
+ template + class iterator : public llvm::mapped_iterator< + InterfaceVectorT::const_iterator, + const InterfaceT &(*)(const DialectInterface *)> { + static const InterfaceT &remapIt(const DialectInterface *interface) { + return *static_cast(interface); + } + + iterator(InterfaceVectorT::const_iterator it) + : llvm::mapped_iterator< + InterfaceVectorT::const_iterator, + const InterfaceT &(*)(const DialectInterface *)>(it, &remapIt) {} + + /// Allow access to the constructor. + friend DialectInterfaceCollectionBase; + }; + + /// Iterator access to the held interfaces. + template iterator interface_begin() const { + return iterator(orderedInterfaces.begin()); + } + template iterator interface_end() const { + return iterator(orderedInterfaces.end()); + } + +private: + /// A set of registered dialect interface instances. + InterfaceSetT interfaces; + /// An ordered list of the registered interface instances, necessary for + /// deterministic iteration. + // NOTE: SetVector does not provide find access, so it can't be used here. + InterfaceVectorT orderedInterfaces; +}; +} // namespace detail + +/// A collection of dialect interfaces within a context, for a given concrete +/// interface type. +template +class DialectInterfaceCollection + : public detail::DialectInterfaceCollectionBase { +public: + using Base = DialectInterfaceCollection; + + /// Collect the registered dialect interfaces within the provided context. + DialectInterfaceCollection(MLIRContext *ctx) + : detail::DialectInterfaceCollectionBase( + ctx, InterfaceType::getInterfaceID()) {} + + /// Get the interface for a given object, or null if one is not registered. + /// The object may be a dialect or an operation instance. + template + const InterfaceType *getInterfaceFor(Object *obj) const { + return static_cast( + detail::DialectInterfaceCollectionBase::getInterfaceFor(obj)); + } + + /// Iterator access to the held interfaces. 
+ using iterator = + detail::DialectInterfaceCollectionBase::iterator; + iterator begin() const { return interface_begin(); } + iterator end() const { return interface_end(); } + +private: + using detail::DialectInterfaceCollectionBase::interface_begin; + using detail::DialectInterfaceCollectionBase::interface_end; +}; + +} // namespace mlir + +#endif diff --git a/mlir/include/mlir/IR/DialectSymbolRegistry.def b/mlir/include/mlir/IR/DialectSymbolRegistry.def new file mode 100644 index 0000000000000000000000000000000000000000..14b876a2ce91ed07dc24c4e071b1242e1d011945 --- /dev/null +++ b/mlir/include/mlir/IR/DialectSymbolRegistry.def @@ -0,0 +1,41 @@ +//===- DialectSymbolRegistry.def - MLIR Dialect Symbol Registry -*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file enumerates the different dialects that define custom classes +// within the attribute or type system. +// +//===----------------------------------------------------------------------===// + +DEFINE_SYM_KIND_RANGE(STANDARD) +DEFINE_SYM_KIND_RANGE(TENSORFLOW_CONTROL) +DEFINE_SYM_KIND_RANGE(TENSORFLOW_EXECUTOR) +DEFINE_SYM_KIND_RANGE(TENSORFLOW) +DEFINE_SYM_KIND_RANGE(LLVM) +DEFINE_SYM_KIND_RANGE(QUANTIZATION) +DEFINE_SYM_KIND_RANGE(IREE) // IREE stands for IR Execution Engine +DEFINE_SYM_KIND_RANGE(LINALG) // Linear Algebra Dialect +DEFINE_SYM_KIND_RANGE(FIR) // Flang Fortran IR Dialect +DEFINE_SYM_KIND_RANGE(OPENMP) // OpenMP IR Dialect +DEFINE_SYM_KIND_RANGE(TOY) // Toy language (tutorial) Dialect +DEFINE_SYM_KIND_RANGE(SPIRV) // SPIR-V dialect +DEFINE_SYM_KIND_RANGE(XLA_HLO) // XLA HLO dialect + +// The following ranges are reserved for experimenting with MLIR dialects in a +// private context without having to register them here. 
+DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_0) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_1) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_2) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_3) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_4) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_5) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_6) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_7) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_8) +DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_9) + +#undef DEFINE_SYM_KIND_RANGE diff --git a/mlir/include/mlir/IR/Function.h b/mlir/include/mlir/IR/Function.h new file mode 100644 index 0000000000000000000000000000000000000000..3f788bbeeba4ebe2e9c8da50cc7a08d957e4a2a7 --- /dev/null +++ b/mlir/include/mlir/IR/Function.h @@ -0,0 +1,201 @@ +//===- Function.h - MLIR Function Class -------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions are the basic unit of composition in MLIR. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_FUNCTION_H +#define MLIR_IR_FUNCTION_H + +#include "mlir/Analysis/CallInterfaces.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/FunctionSupport.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/SymbolTable.h" + +namespace mlir { +//===--------------------------------------------------------------------===// +// Function Operation. +//===--------------------------------------------------------------------===// + +/// FuncOp represents a function, or an operation containing one region that +/// forms a CFG(Control Flow Graph). 
The region of a function is not allowed to +/// implicitly capture global values, and all external references must use +/// Function arguments or attributes that establish a symbolic connection(e.g. +/// symbols referenced by name via a string attribute). +class FuncOp : public Op { +public: + using Op::Op; + using Op::print; + + static StringRef getOperationName() { return "func"; } + + static FuncOp create(Location location, StringRef name, FunctionType type, + ArrayRef attrs = {}); + static FuncOp create(Location location, StringRef name, FunctionType type, + iterator_range attrs); + static FuncOp create(Location location, StringRef name, FunctionType type, + ArrayRef attrs, + ArrayRef argAttrs); + + static void build(Builder *builder, OperationState &result, StringRef name, + FunctionType type, ArrayRef attrs); + static void build(Builder *builder, OperationState &result, StringRef name, + FunctionType type, ArrayRef attrs, + ArrayRef argAttrs); + + /// Operation hooks. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + + /// Erase a single argument at `argIndex`. + void eraseArgument(unsigned argIndex) { eraseArguments({argIndex}); } + /// Erases the arguments listed in `argIndices`. + /// `argIndices` is allowed to have duplicates and can be in any order. + void eraseArguments(ArrayRef argIndices); + + /// Returns the type of this function. + FunctionType getType() { + return getAttrOfType(getTypeAttrName()) + .getValue() + .cast(); + } + + /// Change the type of this function in place. This is an extremely dangerous + /// operation and it is up to the caller to ensure that this is legal for this + /// function, and to restore invariants: + /// - the entry block args must be updated to match the function params. 
+ /// - the argument/result attributes may need an update: if the new type has + /// less parameters we drop the extra attributes, if there are more + /// parameters they won't have any attributes. + void setType(FunctionType newType) { + SmallVector nameBuf; + auto oldType = getType(); + for (int i = newType.getNumInputs(), e = oldType.getNumInputs(); i < e; + i++) { + removeAttr(getArgAttrName(i, nameBuf)); + } + for (int i = newType.getNumResults(), e = oldType.getNumResults(); i < e; + i++) { + removeAttr(getResultAttrName(i, nameBuf)); + } + setAttr(getTypeAttrName(), TypeAttr::get(newType)); + } + + /// Create a deep copy of this function and all of its blocks, remapping + /// any operands that use values outside of the function using the map that is + /// provided (leaving them alone if no entry is present). If the mapper + /// contains entries for function arguments, these arguments are not included + /// in the new function. Replaces references to cloned sub-values with the + /// corresponding value that is copied, and adds those mappings to the mapper. + FuncOp clone(BlockAndValueMapping &mapper); + FuncOp clone(); + + /// Clone the internal blocks and attributes from this function into dest. Any + /// cloned blocks are appended to the back of dest. This function asserts that + /// the attributes of the current function and dest are compatible. + void cloneInto(FuncOp dest, BlockAndValueMapping &mapper); + + //===--------------------------------------------------------------------===// + // Body Handling + //===--------------------------------------------------------------------===// + + /// Add an entry block to an empty function, and set up the block arguments + /// to match the signature of the function. The newly inserted entry block is + /// returned. + Block *addEntryBlock(); + + /// Add a normal block to the end of the function's block list. The function + /// should at least already have an entry block. 
+ Block *addBlock(); + + //===--------------------------------------------------------------------===// + // CallableOpInterface + //===--------------------------------------------------------------------===// + + /// Returns a region on the current operation that the given callable refers + /// to. This may return null in the case of an external callable object, e.g. + /// an external function. + Region *getCallableRegion(CallInterfaceCallable callable) { + assert(callable.get().getLeafReference() == getName()); + return isExternal() ? nullptr : &getBody(); + } + + /// Returns all of the callable regions of this operation. + void getCallableRegions(SmallVectorImpl &callables) { + if (!isExternal()) + callables.push_back(&getBody()); + } + + /// Returns the results types that the given callable region produces when + /// executed. + ArrayRef getCallableResults(Region *region) { + assert(!isExternal() && region == &getBody() && "invalid callable"); + return getType().getResults(); + } + +private: + // This trait needs access to the hooks defined below. + friend class OpTrait::FunctionLike; + + /// Returns the number of arguments. This is a hook for OpTrait::FunctionLike. + unsigned getNumFuncArguments() { return getType().getInputs().size(); } + + /// Returns the number of results. This is a hook for OpTrait::FunctionLike. + unsigned getNumFuncResults() { return getType().getResults().size(); } + + /// Hook for OpTrait::FunctionLike, called after verifying that the 'type' + /// attribute is present and checks if it holds a function type. Ensures + /// getType, getNumFuncArguments, and getNumFuncResults can be called safely. + LogicalResult verifyType() { + auto type = getTypeAttr().getValue(); + if (!type.isa()) + return emitOpError("requires '" + getTypeAttrName() + + "' attribute of function type"); + return success(); + } +}; +} // end namespace mlir + +namespace llvm { + +// Functions hash just like pointers. 
+template <> struct DenseMapInfo { + static mlir::FuncOp getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::FuncOp::getFromOpaquePointer(pointer); + } + static mlir::FuncOp getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::FuncOp::getFromOpaquePointer(pointer); + } + static unsigned getHashValue(mlir::FuncOp val) { + return hash_value(val.getAsOpaquePointer()); + } + static bool isEqual(mlir::FuncOp LHS, mlir::FuncOp RHS) { return LHS == RHS; } +}; + +/// Allow stealing the low bits of FuncOp. +template <> struct PointerLikeTypeTraits { +public: + static inline void *getAsVoidPointer(mlir::FuncOp I) { + return const_cast(I.getAsOpaquePointer()); + } + static inline mlir::FuncOp getFromVoidPointer(void *P) { + return mlir::FuncOp::getFromOpaquePointer(P); + } + enum { NumLowBitsAvailable = 3 }; +}; + +} // namespace llvm + +#endif // MLIR_IR_FUNCTION_H diff --git a/mlir/include/mlir/IR/FunctionImplementation.h b/mlir/include/mlir/IR/FunctionImplementation.h new file mode 100644 index 0000000000000000000000000000000000000000..9d3e438f67e95be77a399604706a53007304c6cb --- /dev/null +++ b/mlir/include/mlir/IR/FunctionImplementation.h @@ -0,0 +1,100 @@ +//===- FunctionImplementation.h - Function-like Op utilities ----*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides utility functions for implementing function-like +// operations, in particular, parsing, printing and verification components +// common to function-like operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_FUNCTIONIMPLEMENTATION_H_ +#define MLIR_IR_FUNCTIONIMPLEMENTATION_H_ + +#include "mlir/IR/FunctionSupport.h" +#include "mlir/IR/OpImplementation.h" + +namespace mlir { + +namespace impl { + +/// A named class for passing around the variadic flag. +class VariadicFlag { +public: + explicit VariadicFlag(bool variadic) : variadic(variadic) {} + bool isVariadic() const { return variadic; } + +private: + /// Underlying storage. + bool variadic; +}; + +/// Adds argument and result attributes, provided as `argAttrs` and +/// `resultAttrs` arguments, to the list of operation attributes in `result`. +/// Internally, argument and result attributes are stored as dict attributes +/// with special names given by getResultAttrName, getArgumentAttrName. +void addArgAndResultAttrs(Builder &builder, OperationState &result, + ArrayRef> argAttrs, + ArrayRef> resultAttrs); + +/// Callback type for `parseFunctionLikeOp`, the callback should produce the +/// type that will be associated with a function-like operation from lists of +/// function arguments and results, VariadicFlag indicates whether the function +/// should have variadic arguments; in case of error, it may populate the last +/// argument with a message. +using FuncTypeBuilder = function_ref, ArrayRef, VariadicFlag, std::string &)>; + +/// Parses a function signature using `parser`. The `allowVariadic` argument +/// indicates whether functions with variadic arguments are supported. The +/// trailing arguments are populated by this function with names, types and +/// attributes of the arguments and those of the results. 
+ParseResult parseFunctionSignature( + OpAsmParser &parser, bool allowVariadic, + SmallVectorImpl &argNames, + SmallVectorImpl &argTypes, + SmallVectorImpl> &argAttrs, bool &isVariadic, + SmallVectorImpl &resultTypes, + SmallVectorImpl> &resultAttrs); + +/// Parser implementation for function-like operations. Uses +/// `funcTypeBuilder` to construct the custom function type given lists of +/// input and output types. If `allowVariadic` is set, the parser will accept +/// trailing ellipsis in the function signature and indicate to the builder +/// whether the function is variadic. If the builder returns a null type, +/// `result` will not contain the `type` attribute. The caller can then add a +/// type, report the error or delegate the reporting to the op's verifier. +ParseResult parseFunctionLikeOp(OpAsmParser &parser, OperationState &result, + bool allowVariadic, + FuncTypeBuilder funcTypeBuilder); + +/// Printer implementation for function-like operations. Accepts lists of +/// argument and result types to use while printing. +void printFunctionLikeOp(OpAsmPrinter &p, Operation *op, + ArrayRef argTypes, bool isVariadic, + ArrayRef resultTypes); + +/// Prints the signature of the function-like operation `op`. Assumes `op` has +/// the FunctionLike trait and passed the verification. +void printFunctionSignature(OpAsmPrinter &p, Operation *op, + ArrayRef argTypes, bool isVariadic, + ArrayRef resultTypes); + +/// Prints the list of function prefixed with the "attributes" keyword. The +/// attributes with names listed in "elided" as well as those used by the +/// function-like operation internally are not printed. Nothing is printed +/// if all attributes are elided. Assumes `op` has the `FunctionLike` trait and +/// passed the verification. 
+void printFunctionAttributes(OpAsmPrinter &p, Operation *op, unsigned numInputs, + unsigned numResults, + ArrayRef elided = {}); + +} // namespace impl + +} // namespace mlir + +#endif // MLIR_IR_FUNCTIONIMPLEMENTATION_H_ diff --git a/mlir/include/mlir/IR/FunctionSupport.h b/mlir/include/mlir/IR/FunctionSupport.h new file mode 100644 index 0000000000000000000000000000000000000000..e6cba2c7404dac940e6e2f6a6b259e254e0aae56 --- /dev/null +++ b/mlir/include/mlir/IR/FunctionSupport.h @@ -0,0 +1,539 @@ +//===- FunctionSupport.h - Utility types for function-like ops --*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines support types for Operations that represent function-like +// constructs to use. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_FUNCTIONSUPPORT_H +#define MLIR_IR_FUNCTIONSUPPORT_H + +#include "mlir/IR/OpDefinition.h" +#include "llvm/ADT/SmallString.h" + +namespace mlir { + +namespace impl { + +/// Return the name of the attribute used for function types. +inline StringRef getTypeAttrName() { return "type"; } + +/// Return the name of the attribute used for function arguments. +inline StringRef getArgAttrName(unsigned arg, SmallVectorImpl &out) { + out.clear(); + return ("arg" + Twine(arg)).toStringRef(out); +} + +/// Return the name of the attribute used for function results. +inline StringRef getResultAttrName(unsigned arg, SmallVectorImpl &out) { + out.clear(); + return ("result" + Twine(arg)).toStringRef(out); +} + +/// Returns the dictionary attribute corresponding to the argument at 'index'. +/// If there are no argument attributes at 'index', a null attribute is +/// returned. 
+inline DictionaryAttr getArgAttrDict(Operation *op, unsigned index) { + SmallString<8> nameOut; + return op->getAttrOfType(getArgAttrName(index, nameOut)); +} + +/// Returns the dictionary attribute corresponding to the result at 'index'. +/// If there are no result attributes at 'index', a null attribute is +/// returned. +inline DictionaryAttr getResultAttrDict(Operation *op, unsigned index) { + SmallString<8> nameOut; + return op->getAttrOfType(getResultAttrName(index, nameOut)); +} + +/// Return all of the attributes for the argument at 'index'. +inline ArrayRef getArgAttrs(Operation *op, unsigned index) { + auto argDict = getArgAttrDict(op, index); + return argDict ? argDict.getValue() : llvm::None; +} + +/// Return all of the attributes for the result at 'index'. +inline ArrayRef getResultAttrs(Operation *op, unsigned index) { + auto resultDict = getResultAttrDict(op, index); + return resultDict ? resultDict.getValue() : llvm::None; +} + +} // namespace impl + +namespace OpTrait { + +/// This trait provides APIs for Ops that behave like functions. In particular: +/// - Ops must be symbols, i.e. also have the `Symbol` trait; +/// - Ops have a single region with multiple blocks that corresponds to the body +/// of the function; +/// - the absence of a region corresponds to an external function; +/// - leading arguments of the first block of the region are treated as function +/// arguments; +/// - they can have argument attributes that are stored in a dictionary +/// attribute on the Op itself. +/// This trait does *NOT* provide type support for the functions, meaning that +/// concrete Ops must handle the type of the declared or defined function. +/// `getTypeAttrName()` is a convenience function that returns the name of the +/// attribute that can be used to store the function type, but the trait makes +/// no assumption based on it. 
+/// +/// - Concrete ops *must* define a member function `getNumFuncArguments()` that +/// returns the number of function arguments based exclusively on type (so +/// that it can be called on function declarations). +/// - Concrete ops *must* define a member function `getNumFuncResults()` that +/// returns the number of function results based exclusively on type (so that +/// it can be called on function declarations). +/// - To verify that the type respects op-specific invariants, concrete ops may +/// redefine the `verifyType()` hook that will be called after verifying the +/// presence of the `type` attribute and before any call to +/// `getNumFuncArguments`/`getNumFuncResults` from the verifier. +/// - To verify that the body respects op-specific invariants, concrete ops may +/// redefine the `verifyBody()` hook that will be called after verifying the +/// function type and the presence of the (potentially empty) body region. +template +class FunctionLike : public OpTrait::TraitBase { +public: + /// Verify that all of the argument attributes are dialect attributes. + static LogicalResult verifyTrait(Operation *op); + + //===--------------------------------------------------------------------===// + // Body Handling + //===--------------------------------------------------------------------===// + + /// Returns true if this function is external, i.e. it has no body. + bool isExternal() { return empty(); } + + Region &getBody() { return this->getOperation()->getRegion(0); } + + /// Delete all blocks from this function. + void eraseBody() { + getBody().dropAllReferences(); + getBody().getBlocks().clear(); + } + + /// This is the list of blocks in the function. + using BlockListType = Region::BlockListType; + BlockListType &getBlocks() { return getBody().getBlocks(); } + + // Iteration over the block in the function. 
+ using iterator = BlockListType::iterator; + using reverse_iterator = BlockListType::reverse_iterator; + + iterator begin() { return getBody().begin(); } + iterator end() { return getBody().end(); } + reverse_iterator rbegin() { return getBody().rbegin(); } + reverse_iterator rend() { return getBody().rend(); } + + bool empty() { return getBody().empty(); } + void push_back(Block *block) { getBody().push_back(block); } + void push_front(Block *block) { getBody().push_front(block); } + + Block &back() { return getBody().back(); } + Block &front() { return getBody().front(); } + + /// Hook for concrete ops to verify the contents of the body. Called as a + /// part of trait verification, after type verification and ensuring that a + /// region exists. + LogicalResult verifyBody(); + + //===--------------------------------------------------------------------===// + // Type Attribute Handling + //===--------------------------------------------------------------------===// + + /// Return the name of the attribute used for function types. + static StringRef getTypeAttrName() { return ::mlir::impl::getTypeAttrName(); } + + TypeAttr getTypeAttr() { + return this->getOperation()->template getAttrOfType( + getTypeAttrName()); + } + + bool isTypeAttrValid() { + auto typeAttr = getTypeAttr(); + if (!typeAttr) + return false; + return typeAttr.getValue() != Type{}; + } + + //===--------------------------------------------------------------------===// + // Argument Handling + //===--------------------------------------------------------------------===// + + unsigned getNumArguments() { + return static_cast(this)->getNumFuncArguments(); + } + + unsigned getNumResults() { + return static_cast(this)->getNumFuncResults(); + } + + /// Gets argument. + BlockArgument getArgument(unsigned idx) { + return getBlocks().front().getArgument(idx); + } + + // Supports non-const operand iteration. 
+ using args_iterator = Block::args_iterator; + args_iterator args_begin() { return front().args_begin(); } + args_iterator args_end() { return front().args_end(); } + iterator_range getArguments() { + return {args_begin(), args_end()}; + } + + //===--------------------------------------------------------------------===// + // Argument Attributes + //===--------------------------------------------------------------------===// + + /// FunctionLike operations allow for attaching attributes to each of the + /// respective function arguments. These argument attributes are stored as + /// DictionaryAttrs in the main operation attribute dictionary. The name of + /// these entries is `arg` followed by the index of the argument. These + /// argument attribute dictionaries are optional, and will generally only + /// exist if they are non-empty. + + /// Return all of the attributes for the argument at 'index'. + ArrayRef getArgAttrs(unsigned index) { + return ::mlir::impl::getArgAttrs(this->getOperation(), index); + } + + /// Return all argument attributes of this function. + void getAllArgAttrs(SmallVectorImpl &result) { + for (unsigned i = 0, e = getNumArguments(); i != e; ++i) + result.emplace_back(getArgAttrDict(i)); + } + + /// Return the specified attribute, if present, for the argument at 'index', + /// null otherwise. + Attribute getArgAttr(unsigned index, Identifier name) { + auto argDict = getArgAttrDict(index); + return argDict ? argDict.get(name) : nullptr; + } + Attribute getArgAttr(unsigned index, StringRef name) { + auto argDict = getArgAttrDict(index); + return argDict ? argDict.get(name) : nullptr; + } + + template + AttrClass getArgAttrOfType(unsigned index, Identifier name) { + return getArgAttr(index, name).template dyn_cast_or_null(); + } + template + AttrClass getArgAttrOfType(unsigned index, StringRef name) { + return getArgAttr(index, name).template dyn_cast_or_null(); + } + + /// Set the attributes held by the argument at 'index'. 
+ void setArgAttrs(unsigned index, ArrayRef attributes); + void setArgAttrs(unsigned index, NamedAttributeList attributes); + void setAllArgAttrs(ArrayRef attributes) { + assert(attributes.size() == getNumArguments()); + for (unsigned i = 0, e = attributes.size(); i != e; ++i) + setArgAttrs(i, attributes[i]); + } + + /// If the an attribute exists with the specified name, change it to the new + /// value. Otherwise, add a new attribute with the specified name/value. + void setArgAttr(unsigned index, Identifier name, Attribute value); + void setArgAttr(unsigned index, StringRef name, Attribute value) { + setArgAttr(index, Identifier::get(name, this->getOperation()->getContext()), + value); + } + + /// Remove the attribute 'name' from the argument at 'index'. + NamedAttributeList::RemoveResult removeArgAttr(unsigned index, + Identifier name); + + //===--------------------------------------------------------------------===// + // Result Attributes + //===--------------------------------------------------------------------===// + + /// FunctionLike operations allow for attaching attributes to each of the + /// respective function results. These result attributes are stored as + /// DictionaryAttrs in the main operation attribute dictionary. The name of + /// these entries is `result` followed by the index of the result. These + /// result attribute dictionaries are optional, and will generally only + /// exist if they are non-empty. + + /// Return all of the attributes for the result at 'index'. + ArrayRef getResultAttrs(unsigned index) { + return ::mlir::impl::getResultAttrs(this->getOperation(), index); + } + + /// Return all result attributes of this function. + void getAllResultAttrs(SmallVectorImpl &result) { + for (unsigned i = 0, e = getNumResults(); i != e; ++i) + result.emplace_back(getResultAttrDict(i)); + } + + /// Return the specified attribute, if present, for the result at 'index', + /// null otherwise. 
+ Attribute getResultAttr(unsigned index, Identifier name) { + auto argDict = getResultAttrDict(index); + return argDict ? argDict.get(name) : nullptr; + } + Attribute getResultAttr(unsigned index, StringRef name) { + auto argDict = getResultAttrDict(index); + return argDict ? argDict.get(name) : nullptr; + } + + template + AttrClass getResultAttrOfType(unsigned index, Identifier name) { + return getResultAttr(index, name).template dyn_cast_or_null(); + } + template + AttrClass getResultAttrOfType(unsigned index, StringRef name) { + return getResultAttr(index, name).template dyn_cast_or_null(); + } + + /// Set the attributes held by the result at 'index'. + void setResultAttrs(unsigned index, ArrayRef attributes); + void setResultAttrs(unsigned index, NamedAttributeList attributes); + void setAllResultAttrs(ArrayRef attributes) { + assert(attributes.size() == getNumResults()); + for (unsigned i = 0, e = attributes.size(); i != e; ++i) + setResultAttrs(i, attributes[i]); + } + + /// If the an attribute exists with the specified name, change it to the new + /// value. Otherwise, add a new attribute with the specified name/value. + void setResultAttr(unsigned index, Identifier name, Attribute value); + void setResultAttr(unsigned index, StringRef name, Attribute value) { + setResultAttr(index, + Identifier::get(name, this->getOperation()->getContext()), + value); + } + + /// Remove the attribute 'name' from the result at 'index'. + NamedAttributeList::RemoveResult removeResultAttr(unsigned index, + Identifier name); + +protected: + /// Returns the attribute entry name for the set of argument attributes at + /// 'index'. + static StringRef getArgAttrName(unsigned index, SmallVectorImpl &out) { + return ::mlir::impl::getArgAttrName(index, out); + } + + /// Returns the dictionary attribute corresponding to the argument at 'index'. + /// If there are no argument attributes at 'index', a null attribute is + /// returned. 
+ DictionaryAttr getArgAttrDict(unsigned index) { + assert(index < getNumArguments() && "invalid argument number"); + return ::mlir::impl::getArgAttrDict(this->getOperation(), index); + } + + /// Returns the attribute entry name for the set of result attributes at + /// 'index'. + static StringRef getResultAttrName(unsigned index, + SmallVectorImpl &out) { + return ::mlir::impl::getResultAttrName(index, out); + } + + /// Returns the dictionary attribute corresponding to the result at 'index'. + /// If there are no result attributes at 'index', a null attribute is + /// returned. + DictionaryAttr getResultAttrDict(unsigned index) { + assert(index < getNumResults() && "invalid result number"); + return ::mlir::impl::getResultAttrDict(this->getOperation(), index); + } + + /// Hook for concrete classes to verify that the type attribute respects + /// op-specific invariants. Default implementation always succeeds. + LogicalResult verifyType() { return success(); } +}; + +/// Default verifier checks that if the entry block exists, it has the same +/// number of arguments as the function-like operation. +template +LogicalResult FunctionLike::verifyBody() { + auto funcOp = cast(this->getOperation()); + + if (funcOp.isExternal()) + return success(); + + unsigned numArguments = funcOp.getNumArguments(); + if (funcOp.front().getNumArguments() != numArguments) + return funcOp.emitOpError("entry block must have ") + << numArguments << " arguments to match function signature"; + + return success(); +} + +template +LogicalResult FunctionLike::verifyTrait(Operation *op) { + MLIRContext *ctx = op->getContext(); + auto funcOp = cast(op); + + if (!funcOp.isTypeAttrValid()) + return funcOp.emitOpError("requires a type attribute '") + << getTypeAttrName() << '\''; + + if (failed(funcOp.verifyType())) + return failure(); + + for (unsigned i = 0, e = funcOp.getNumArguments(); i != e; ++i) { + // Verify that all of the argument attributes are dialect attributes, i.e. 
+ // that they contain a dialect prefix in their name. Call the dialect, if + // registered, to verify the attributes themselves. + for (auto attr : funcOp.getArgAttrs(i)) { + if (!attr.first.strref().contains('.')) + return funcOp.emitOpError("arguments may only have dialect attributes"); + auto dialectNamePair = attr.first.strref().split('.'); + if (auto *dialect = ctx->getRegisteredDialect(dialectNamePair.first)) { + if (failed(dialect->verifyRegionArgAttribute(op, /*regionIndex=*/0, + /*argIndex=*/i, attr))) + return failure(); + } + } + } + + for (unsigned i = 0, e = funcOp.getNumResults(); i != e; ++i) { + // Verify that all of the result attributes are dialect attributes, i.e. + // that they contain a dialect prefix in their name. Call the dialect, if + // registered, to verify the attributes themselves. + for (auto attr : funcOp.getResultAttrs(i)) { + if (!attr.first.strref().contains('.')) + return funcOp.emitOpError("results may only have dialect attributes"); + auto dialectNamePair = attr.first.strref().split('.'); + if (auto *dialect = ctx->getRegisteredDialect(dialectNamePair.first)) { + if (failed(dialect->verifyRegionResultAttribute(op, /*regionIndex=*/0, + /*resultIndex=*/i, + attr))) + return failure(); + } + } + } + + // Check that the op has exactly one region for the body. + if (op->getNumRegions() != 1) + return funcOp.emitOpError("expects one region"); + + return funcOp.verifyBody(); +} + +//===----------------------------------------------------------------------===// +// Function Argument Attribute. +//===----------------------------------------------------------------------===// + +/// Set the attributes held by the argument at 'index'. 
+template +void FunctionLike::setArgAttrs( + unsigned index, ArrayRef attributes) { + assert(index < getNumArguments() && "invalid argument number"); + SmallString<8> nameOut; + getArgAttrName(index, nameOut); + + if (attributes.empty()) + return (void)static_cast(this)->removeAttr(nameOut); + Operation *op = this->getOperation(); + op->setAttr(nameOut, DictionaryAttr::get(attributes, op->getContext())); +} + +template +void FunctionLike::setArgAttrs(unsigned index, + NamedAttributeList attributes) { + assert(index < getNumArguments() && "invalid argument number"); + SmallString<8> nameOut; + if (auto newAttr = attributes.getDictionary()) + return this->getOperation()->setAttr(getArgAttrName(index, nameOut), + newAttr); + static_cast(this)->removeAttr(getArgAttrName(index, nameOut)); +} + +/// If the an attribute exists with the specified name, change it to the new +/// value. Otherwise, add a new attribute with the specified name/value. +template +void FunctionLike::setArgAttr(unsigned index, Identifier name, + Attribute value) { + auto curAttr = getArgAttrDict(index); + NamedAttributeList attrList(curAttr); + attrList.set(name, value); + + // If the attribute changed, then set the new arg attribute list. + if (curAttr != attrList.getDictionary()) + setArgAttrs(index, attrList); +} + +/// Remove the attribute 'name' from the argument at 'index'. +template +NamedAttributeList::RemoveResult +FunctionLike::removeArgAttr(unsigned index, Identifier name) { + // Build an attribute list and remove the attribute at 'name'. + NamedAttributeList attrList(getArgAttrDict(index)); + auto result = attrList.remove(name); + + // If the attribute was removed, then update the argument dictionary. + if (result == NamedAttributeList::RemoveResult::Removed) + setArgAttrs(index, attrList); + return result; +} + +//===----------------------------------------------------------------------===// +// Function Result Attribute. 
+//===----------------------------------------------------------------------===// + +/// Set the attributes held by the result at 'index'. +template +void FunctionLike::setResultAttrs( + unsigned index, ArrayRef attributes) { + assert(index < getNumResults() && "invalid result number"); + SmallString<8> nameOut; + getResultAttrName(index, nameOut); + + if (attributes.empty()) + return (void)static_cast(this)->removeAttr(nameOut); + Operation *op = this->getOperation(); + op->setAttr(nameOut, DictionaryAttr::get(attributes, op->getContext())); +} + +template +void FunctionLike::setResultAttrs(unsigned index, + NamedAttributeList attributes) { + assert(index < getNumResults() && "invalid result number"); + SmallString<8> nameOut; + if (auto newAttr = attributes.getDictionary()) + return this->getOperation()->setAttr(getResultAttrName(index, nameOut), + newAttr); + static_cast(this)->removeAttr( + getResultAttrName(index, nameOut)); +} + +/// If the an attribute exists with the specified name, change it to the new +/// value. Otherwise, add a new attribute with the specified name/value. +template +void FunctionLike::setResultAttr(unsigned index, Identifier name, + Attribute value) { + auto curAttr = getResultAttrDict(index); + NamedAttributeList attrList(curAttr); + attrList.set(name, value); + + // If the attribute changed, then set the new arg attribute list. + if (curAttr != attrList.getDictionary()) + setResultAttrs(index, attrList); +} + +/// Remove the attribute 'name' from the result at 'index'. +template +NamedAttributeList::RemoveResult +FunctionLike::removeResultAttr(unsigned index, Identifier name) { + // Build an attribute list and remove the attribute at 'name'. + NamedAttributeList attrList(getResultAttrDict(index)); + auto result = attrList.remove(name); + + // If the attribute was removed, then update the result dictionary. 
+ if (result == NamedAttributeList::RemoveResult::Removed) + setResultAttrs(index, attrList); + return result; +} + +} // end namespace OpTrait + +} // end namespace mlir + +#endif // MLIR_IR_FUNCTIONSUPPORT_H diff --git a/mlir/include/mlir/IR/Identifier.h b/mlir/include/mlir/IR/Identifier.h new file mode 100644 index 0000000000000000000000000000000000000000..604eebf341e4f1de8ca2c50fcb8346b285756fec --- /dev/null +++ b/mlir/include/mlir/IR/Identifier.h @@ -0,0 +1,134 @@ +//===- Identifier.h - MLIR Identifier Class ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_IDENTIFIER_H +#define MLIR_IR_IDENTIFIER_H + +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/StringRef.h" + +namespace mlir { +class MLIRContext; + +/// This class represents a uniqued string owned by an MLIRContext. Strings +/// represented by this type cannot contain nul characters, and may not have a +/// zero length. +/// +/// This is a POD type with pointer size, so it should be passed around by +/// value. The underlying data is owned by MLIRContext and is thus immortal for +/// almost all clients. +class Identifier { +public: + /// Return an identifier for the specified string. + static Identifier get(StringRef str, MLIRContext *context); + Identifier(const Identifier &) = default; + Identifier &operator=(const Identifier &other) = default; + + /// Return a StringRef for the string. + StringRef strref() const { return StringRef(pointer, size()); } + + /// Identifiers implicitly convert to StringRefs. + operator StringRef() const { return strref(); } + + /// Return an std::string. 
+ std::string str() const { return strref().str(); } + + /// Return a null terminated C string. + const char *c_str() const { return pointer; } + + /// Return a pointer to the start of the string data. + const char *data() const { return pointer; } + + /// Return the number of bytes in this string. + unsigned size() const { return ::strlen(pointer); } + + /// Return true if this identifier is the specified string. + bool is(StringRef string) const { return strref().equals(string); } + + const char *begin() const { return pointer; } + const char *end() const { return pointer + size(); } + + void print(raw_ostream &os) const; + void dump() const; + + const void *getAsOpaquePointer() const { + return static_cast(pointer); + } + static Identifier getFromOpaquePointer(const void *pointer) { + return Identifier((const char *)pointer); + } + +private: + /// These are the bytes of the string, which is a nul terminated string. + const char *pointer; + explicit Identifier(const char *pointer) : pointer(pointer) {} +}; + +inline raw_ostream &operator<<(raw_ostream &os, Identifier identifier) { + identifier.print(os); + return os; +} + +inline bool operator==(Identifier lhs, Identifier rhs) { + return lhs.data() == rhs.data(); +} + +inline bool operator!=(Identifier lhs, Identifier rhs) { + return lhs.data() != rhs.data(); +} + +inline bool operator==(Identifier lhs, StringRef rhs) { return lhs.is(rhs); } +inline bool operator!=(Identifier lhs, StringRef rhs) { return !lhs.is(rhs); } +inline bool operator==(StringRef lhs, Identifier rhs) { return rhs.is(lhs); } +inline bool operator!=(StringRef lhs, Identifier rhs) { return !rhs.is(lhs); } + +// Make identifiers hashable. +inline llvm::hash_code hash_value(Identifier arg) { + return llvm::hash_value(arg.strref()); +} + +} // end namespace mlir + +namespace llvm { +// Identifiers hash just like pointers, there is no need to hash the bytes. 
+template <> +struct DenseMapInfo { + static mlir::Identifier getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::Identifier::getFromOpaquePointer(pointer); + } + static mlir::Identifier getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::Identifier::getFromOpaquePointer(pointer); + } + static unsigned getHashValue(mlir::Identifier Val) { + return DenseMapInfo::getHashValue(Val.data()); + } + static bool isEqual(mlir::Identifier LHS, mlir::Identifier RHS) { + return LHS == RHS; + } +}; + +/// The pointer inside of an identifier comes from a StringMap, so its alignment +/// is always at least 4 and probably 8 (on 64-bit machines). Allow LLVM to +/// steal the low bits. +template <> +struct PointerLikeTypeTraits { +public: + static inline void *getAsVoidPointer(mlir::Identifier I) { + return const_cast(I.getAsOpaquePointer()); + } + static inline mlir::Identifier getFromVoidPointer(void *P) { + return mlir::Identifier::getFromOpaquePointer(P); + } + enum { NumLowBitsAvailable = 2 }; +}; + +} // end namespace llvm +#endif diff --git a/mlir/include/mlir/IR/IntegerSet.h b/mlir/include/mlir/IR/IntegerSet.h new file mode 100644 index 0000000000000000000000000000000000000000..1238511df34cff9f177f690c78c0fad62b9ff5a2 --- /dev/null +++ b/mlir/include/mlir/IR/IntegerSet.h @@ -0,0 +1,142 @@ +//===- IntegerSet.h - MLIR Integer Set Class --------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Integer sets are sets of points from the integer lattice constrained by +// affine equality/inequality constraints. 
This class is meant to represent +// integer sets in the IR - for 'affine.if' operations and as attributes of +// other operations. It is typically expected to contain only a handful of +// affine constraints, and is immutable like an affine map. Integer sets are not +// unique'd - although affine expressions that make up its equalities and +// inequalities are themselves unique. + +// This class is not meant for affine analysis and operations like set +// operations, emptiness checks, or other math operations for analysis and +// transformation. For the latter, use FlatAffineConstraints. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_INTEGER_SET_H +#define MLIR_IR_INTEGER_SET_H + +#include "mlir/IR/AffineExpr.h" +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { + +namespace detail { +struct IntegerSetStorage; +} + +class MLIRContext; + +/// An integer set representing a conjunction of one or more affine equalities +/// and inequalities. An integer set in the IR is immutable like the affine map, +/// but integer sets are not unique'd. The affine expressions that make up the +/// equalities and inequalities of an integer set are themselves unique and are +/// allocated by the bump pointer allocator. +class IntegerSet { +public: + using ImplType = detail::IntegerSetStorage; + + IntegerSet() : set(nullptr) {} + explicit IntegerSet(ImplType *set) : set(set) {} + IntegerSet(const IntegerSet &other) : set(other.set) {} + IntegerSet &operator=(const IntegerSet &other) = default; + + static IntegerSet get(unsigned dimCount, unsigned symbolCount, + ArrayRef constraints, + ArrayRef eqFlags); + + // Returns the canonical empty IntegerSet (i.e. a set with no integer points). 
+ static IntegerSet getEmptySet(unsigned numDims, unsigned numSymbols, + MLIRContext *context) { + auto one = getAffineConstantExpr(1, context); + /* 1 == 0 */ + return get(numDims, numSymbols, one, true); + } + + /// Returns true if this is the canonical integer set. + bool isEmptyIntegerSet() const; + + /// This method substitutes any uses of dimensions and symbols (e.g. + /// dim#0 with dimReplacements[0]) in subexpressions and returns the modified + /// integer set. Because this can be used to eliminate dims and + /// symbols, the client needs to specify the number of dims and symbols in + /// the result. The returned map always has the same number of results. + IntegerSet replaceDimsAndSymbols(ArrayRef dimReplacements, + ArrayRef symReplacements, + unsigned numResultDims, + unsigned numResultSyms); + + explicit operator bool() { return set; } + bool operator==(IntegerSet other) const { return set == other.set; } + + unsigned getNumDims() const; + unsigned getNumSymbols() const; + unsigned getNumInputs() const; + unsigned getNumConstraints() const; + unsigned getNumEqualities() const; + unsigned getNumInequalities() const; + + ArrayRef getConstraints() const; + + AffineExpr getConstraint(unsigned idx) const; + + /// Returns the equality bits, which specify whether each of the constraints + /// is an equality or inequality. + ArrayRef getEqFlags() const; + + /// Returns true if the idx^th constraint is an equality, false if it is an + /// inequality. + bool isEq(unsigned idx) const; + + MLIRContext *getContext() const; + + /// Walk all of the AffineExpr's in this set's constraints. Each node in an + /// expression tree is visited in postorder. + void walkExprs(function_ref callback) const; + + void print(raw_ostream &os) const; + void dump() const; + + friend ::llvm::hash_code hash_value(IntegerSet arg); + +private: + ImplType *set; + /// Sets with constraints fewer than kUniquingThreshold are uniqued. 
+ constexpr static unsigned kUniquingThreshold = 4; +}; + +// Make AffineExpr hashable. +inline ::llvm::hash_code hash_value(IntegerSet arg) { + return ::llvm::hash_value(arg.set); +} + +} // end namespace mlir +namespace llvm { + +// IntegerSet hash just like pointers +template <> struct DenseMapInfo { + static mlir::IntegerSet getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::IntegerSet(static_cast(pointer)); + } + static mlir::IntegerSet getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::IntegerSet(static_cast(pointer)); + } + static unsigned getHashValue(mlir::IntegerSet val) { + return mlir::hash_value(val); + } + static bool isEqual(mlir::IntegerSet LHS, mlir::IntegerSet RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm +#endif // MLIR_IR_INTEGER_SET_H diff --git a/mlir/include/mlir/IR/Location.h b/mlir/include/mlir/IR/Location.h new file mode 100644 index 0000000000000000000000000000000000000000..c36bcb3073541f00786962d9ac196a1cd6a4909f --- /dev/null +++ b/mlir/include/mlir/IR/Location.h @@ -0,0 +1,332 @@ +//===- Location.h - MLIR Location Classes -----------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// These classes provide the ability to relate MLIR objects back to source +// location position information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_LOCATION_H +#define MLIR_IR_LOCATION_H + +#include "mlir/IR/Attributes.h" + +namespace mlir { + +class Attribute; +class MLIRContext; +class Identifier; + +namespace detail { + +struct CallSiteLocationStorage; +struct FileLineColLocationStorage; +struct FusedLocationStorage; +struct LocationStorage; +struct NameLocationStorage; +struct OpaqueLocationStorage; +struct UnknownLocationStorage; + +} // namespace detail + +/// Location objects represent source locations information in MLIR. +/// LocationAttr acts as the anchor for all Location based attributes. +class LocationAttr : public Attribute { +public: + using Attribute::Attribute; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool classof(Attribute attr) { + return attr.getKind() >= StandardAttributes::FIRST_LOCATION_ATTR && + attr.getKind() <= StandardAttributes::LAST_LOCATION_ATTR; + } +}; + +/// This class defines the main interface for locations in MLIR and acts as a +/// non-nullable wrapper around a LocationAttr. +class Location { +public: + Location(LocationAttr loc) : impl(loc) { + assert(loc && "location should never be null."); + } + + /// Access the impl location attribute. + operator LocationAttr() const { return impl; } + LocationAttr *operator->() const { return const_cast(&impl); } + + /// Type casting utilities on the underlying location. + template bool isa() const { return impl.isa(); } + template U dyn_cast() const { return impl.dyn_cast(); } + template U cast() const { return impl.cast(); } + + /// Comparison operators. + bool operator==(Location rhs) const { return impl == rhs.impl; } + bool operator!=(Location rhs) const { return !(*this == rhs); } + + /// Print the location. 
+ void print(raw_ostream &os) const { impl.print(os); } + void dump() const { impl.dump(); } + + friend ::llvm::hash_code hash_value(Location arg); + + /// Methods for supporting PointerLikeTypeTraits. + const void *getAsOpaquePointer() const { return impl.getAsOpaquePointer(); } + static Location getFromOpaquePointer(const void *pointer) { + return LocationAttr(reinterpret_cast(pointer)); + } + +protected: + /// The internal backing location attribute. + LocationAttr impl; +}; + +inline raw_ostream &operator<<(raw_ostream &os, const Location &loc) { + loc.print(os); + return os; +} + +/// Represents a location as call site. "callee" is the concrete location +/// (Unknown/NameLocation/FileLineColLoc/OpaqueLoc) and "caller" points to the +/// caller's location (another CallLocation or a concrete location). Multiple +/// CallSiteLocs can be chained to form a call stack. +class CallSiteLoc + : public Attribute::AttrBase { +public: + using Base::Base; + + /// Return a uniqued call location object. + static Location get(Location callee, Location caller); + + /// Return a call site location which represents a name reference in one line + /// or a stack of frames. The input frames are ordered from innermost to + /// outermost. + static Location get(Location name, ArrayRef frames); + + /// The concrete location information this object presents. + Location getCallee() const; + + /// The caller's location. + Location getCaller() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::CallSiteLocation; + } +}; + +/// Represents a location derived from a file/line/column location. The column +/// and line may be zero to represent unknown column and/or unknown line/column +/// information. +class FileLineColLoc + : public Attribute::AttrBase { +public: + using Base::Base; + + /// Return a uniqued FileLineCol location object. 
+ static Location get(Identifier filename, unsigned line, unsigned column, + MLIRContext *context); + static Location get(StringRef filename, unsigned line, unsigned column, + MLIRContext *context); + + StringRef getFilename() const; + + unsigned getLine() const; + unsigned getColumn() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::FileLineColLocation; + } +}; + +/// Represents a value composed of multiple source constructs, with an optional +/// metadata attribute. +class FusedLoc : public Attribute::AttrBase { +public: + using Base::Base; + + /// Return a uniqued Fused Location object. The first location in the list + /// will get precedence during diagnostic emission, with the rest being + /// displayed as supplementary "fused from here" style notes. + static Location get(ArrayRef locs, Attribute metadata, + MLIRContext *context); + static Location get(ArrayRef locs, MLIRContext *context) { + return get(locs, Attribute(), context); + } + + ArrayRef getLocations() const; + + /// Returns the optional metadata attached to this fused location. Given that + /// it is optional, the return value may be a null node. + Attribute getMetadata() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::FusedLocation; + } +}; + +/// Represents an identity name attached to a child location. +class NameLoc : public Attribute::AttrBase { +public: + using Base::Base; + + /// Return a uniqued name location object. The child location must not be + /// another NameLoc. + static Location get(Identifier name, Location child); + + /// Return a uniqued name location object with an unknown child. + static Location get(Identifier name, MLIRContext *context); + + /// Return the name identifier. + Identifier getName() const; + + /// Return the child location. 
+ Location getChildLoc() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::NameLocation; + } +}; + +/// Represents an unknown location. This is always a singleton for a given +/// MLIRContext. +class UnknownLoc : public Attribute::AttrBase { +public: + using Base::Base; + + /// Get an instance of the UnknownLoc. + static Location get(MLIRContext *context); + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::UnknownLocation; + } +}; + +/// Represents a location that is external to MLIR. Contains a pointer to some +/// data structure and an optional location that can be used if the first one is +/// not suitable. Since it contains an external structure, only optional +/// location is used during serialization. +/// The class also provides a number of methods for making type-safe casts +/// between a pointer to an object and opaque location. +class OpaqueLoc : public Attribute::AttrBase { +public: + using Base::Base; + + /// Returns an instance of opaque location which contains a given pointer to + /// an object. The corresponding MLIR location is set to UnknownLoc. + template + static Location get(T underlyingLocation, MLIRContext *context) { + return get(reinterpret_cast(underlyingLocation), + ClassID::getID(), UnknownLoc::get(context)); + } + + /// Returns an instance of opaque location which contains a given pointer to + /// an object and an additional MLIR location. + template + static Location get(T underlyingLocation, Location fallbackLocation) { + return get(reinterpret_cast(underlyingLocation), + ClassID::getID(), fallbackLocation); + } + + /// Returns a pointer to some data structure that opaque location stores. 
+ template static T getUnderlyingLocation(Location location) { + assert(isa(location)); + return reinterpret_cast( + location.cast().getUnderlyingLocation()); + } + + /// Returns a pointer to some data structure that opaque location stores. + /// Returns nullptr if provided location is not opaque location or if it + /// contains a pointer of different type. + template + static T getUnderlyingLocationOrNull(Location location) { + return isa(location) + ? reinterpret_cast( + location.cast().getUnderlyingLocation()) + : T(nullptr); + } + + /// Checks whether provided location is opaque location and contains a pointer + /// to an object of particular type. + template static bool isa(Location location) { + auto opaque_loc = location.dyn_cast(); + return opaque_loc && opaque_loc.getClassId() == ClassID::getID(); + } + + /// Returns a pointer to the corresponding object. + uintptr_t getUnderlyingLocation() const; + + /// Returns a ClassID* that represents the underlying objects c++ type. + ClassID *getClassId() const; + + /// Returns a fallback location. + Location getFallbackLocation() const; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool kindof(unsigned kind) { + return kind == StandardAttributes::OpaqueLocation; + } + +private: + static Location get(uintptr_t underlyingLocation, ClassID *classID, + Location fallbackLocation); +}; + +// Make Location hashable. +inline ::llvm::hash_code hash_value(Location arg) { + return hash_value(arg.impl); +} + +} // end namespace mlir + +namespace llvm { + +// Type hash just like pointers. 
+template <> struct DenseMapInfo { + static mlir::Location getEmptyKey() { + auto pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlir::Location::getFromOpaquePointer(pointer); + } + static mlir::Location getTombstoneKey() { + auto pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlir::Location::getFromOpaquePointer(pointer); + } + static unsigned getHashValue(mlir::Location val) { + return mlir::hash_value(val); + } + static bool isEqual(mlir::Location LHS, mlir::Location RHS) { + return LHS == RHS; + } +}; + +/// We align LocationStorage by 8, so allow LLVM to steal the low bits. +template <> struct PointerLikeTypeTraits { +public: + static inline void *getAsVoidPointer(mlir::Location I) { + return const_cast(I.getAsOpaquePointer()); + } + static inline mlir::Location getFromVoidPointer(void *P) { + return mlir::Location::getFromOpaquePointer(P); + } + enum { + NumLowBitsAvailable = + PointerLikeTypeTraits::NumLowBitsAvailable + }; +}; + +} // namespace llvm + +#endif diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h new file mode 100644 index 0000000000000000000000000000000000000000..e0761bcaaf13546b0f67f6c7da6efd579c8dfbbe --- /dev/null +++ b/mlir/include/mlir/IR/MLIRContext.h @@ -0,0 +1,83 @@ +//===- MLIRContext.h - MLIR Global Context Class ----------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_MLIRCONTEXT_H +#define MLIR_IR_MLIRCONTEXT_H + +#include "mlir/Support/LLVM.h" +#include +#include +#include + +namespace mlir { +class AbstractOperation; +class DiagnosticEngine; +class Dialect; +class InFlightDiagnostic; +class Location; +class MLIRContextImpl; +class StorageUniquer; + +/// MLIRContext is the top-level object for a collection of MLIR modules. It +/// holds immortal uniqued objects like types, and the tables used to unique +/// them. +/// +/// MLIRContext gets a redundant "MLIR" prefix because otherwise it ends up with +/// a very generic name ("Context") and because it is uncommon for clients to +/// interact with it. +/// +class MLIRContext { +public: + explicit MLIRContext(); + ~MLIRContext(); + + /// Return information about all registered IR dialects. + std::vector getRegisteredDialects(); + + /// Get a registered IR dialect with the given namespace. If an exact match is + /// not found, then return nullptr. + Dialect *getRegisteredDialect(StringRef name); + + /// Get a registered IR dialect for the given derived dialect type. The + /// derived type must provide a static 'getDialectNamespace' method. + template T *getRegisteredDialect() { + return static_cast(getRegisteredDialect(T::getDialectNamespace())); + } + + /// Return information about all registered operations. This isn't very + /// efficient: typically you should ask the operations about their properties + /// directly. + std::vector getRegisteredOperations(); + + // This is effectively private given that only MLIRContext.cpp can see the + // MLIRContextImpl type. + MLIRContextImpl &getImpl() { return *impl; } + + /// Returns the diagnostic engine for this context. + DiagnosticEngine &getDiagEngine(); + + /// Returns the storage uniquer used for creating affine constructs. 
+ StorageUniquer &getAffineUniquer(); + + /// Returns the storage uniquer used for constructing type storage instances. + /// This should not be used directly. + StorageUniquer &getTypeUniquer(); + + /// Returns the storage uniquer used for constructing attribute storage + /// instances. This should not be used directly. + StorageUniquer &getAttributeUniquer(); + +private: + const std::unique_ptr impl; + + MLIRContext(const MLIRContext &) = delete; + void operator=(const MLIRContext &) = delete; +}; +} // end namespace mlir + +#endif // MLIR_IR_MLIRCONTEXT_H diff --git a/mlir/include/mlir/IR/Matchers.h b/mlir/include/mlir/IR/Matchers.h new file mode 100644 index 0000000000000000000000000000000000000000..2cfa2428bd590e4e519e99f7f9e53003cbf24953 --- /dev/null +++ b/mlir/include/mlir/IR/Matchers.h @@ -0,0 +1,261 @@ +//===- Matchers.h - Various common matchers ---------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides a simple and efficient mechanism for performing general +// tree-based pattern matching over MLIR. This mechanism is inspired by LLVM's +// include/llvm/IR/PatternMatch.h. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_MATCHERS_H +#define MLIR_MATCHERS_H + +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/StandardTypes.h" + +namespace mlir { + +namespace detail { + +/// The matcher that matches a certain kind of Attribute and binds the value +/// inside the Attribute. 
+template < + typename AttrClass, + // Require AttrClass to be a derived class from Attribute and get its + // value type + typename ValueType = + typename std::enable_if::value, + AttrClass>::type::ValueType, + // Require the ValueType is not void + typename = typename std::enable_if::value>::type> +struct attr_value_binder { + ValueType *bind_value; + + /// Creates a matcher instance that binds the value to bv if match succeeds. + attr_value_binder(ValueType *bv) : bind_value(bv) {} + + bool match(const Attribute &attr) { + if (auto intAttr = attr.dyn_cast()) { + *bind_value = intAttr.getValue(); + return true; + } + return false; + } +}; + +/// The matcher that matches a constant foldable operation that has no side +/// effect, no operands and produces a single result. +template struct constant_op_binder { + AttrT *bind_value; + + /// Creates a matcher instance that binds the constant attribute value to + /// bind_value if match succeeds. + constant_op_binder(AttrT *bind_value) : bind_value(bind_value) {} + + bool match(Operation *op) { + if (op->getNumOperands() > 0 || op->getNumResults() != 1) + return false; + if (!op->hasNoSideEffect()) + return false; + + SmallVector foldedOp; + if (succeeded(op->fold(/*operands=*/llvm::None, foldedOp))) { + if (auto attr = foldedOp.front().dyn_cast()) { + if ((*bind_value = attr.dyn_cast())) + return true; + } + } + return false; + } +}; + +/// The matcher that matches a constant scalar / vector splat / tensor splat +/// integer operation and binds the constant integer value. +struct constant_int_op_binder { + IntegerAttr::ValueType *bind_value; + + /// Creates a matcher instance that binds the value to bv if match succeeds. 
+ constant_int_op_binder(IntegerAttr::ValueType *bv) : bind_value(bv) {} + + bool match(Operation *op) { + Attribute attr; + if (!constant_op_binder(&attr).match(op)) + return false; + auto type = op->getResult(0)->getType(); + + if (type.isIntOrIndex()) { + return attr_value_binder(bind_value).match(attr); + } + if (type.isa() || type.isa()) { + if (auto splatAttr = attr.dyn_cast()) { + return attr_value_binder(bind_value) + .match(splatAttr.getSplatValue()); + } + } + return false; + } +}; + +/// The matcher that matches a given target constant scalar / vector splat / +/// tensor splat integer value. +template struct constant_int_value_matcher { + bool match(Operation *op) { + APInt value; + return constant_int_op_binder(&value).match(op) && TargetValue == value; + } +}; + +/// The matcher that matches anything except the given target constant scalar / +/// vector splat / tensor splat integer value. +template struct constant_int_not_value_matcher { + bool match(Operation *op) { + APInt value; + return constant_int_op_binder(&value).match(op) && TargetNotValue != value; + } +}; + +/// The matcher that matches a certain kind of op. +template struct op_matcher { + bool match(Operation *op) { return isa(op); } +}; + +/// Trait to check whether T provides a 'match' method with type +/// `OperationOrValue`. +template +using has_operation_or_value_matcher_t = + decltype(std::declval().match(std::declval())); + +/// Statically switch to a Value matcher. +template +typename std::enable_if_t::value, + bool> +matchOperandOrValueAtIndex(Operation *op, unsigned idx, MatcherClass &matcher) { + return matcher.match(op->getOperand(idx)); +} + +/// Statically switch to an Operation matcher. +template +typename std::enable_if_t::value, + bool> +matchOperandOrValueAtIndex(Operation *op, unsigned idx, MatcherClass &matcher) { + if (auto defOp = op->getOperand(idx)->getDefiningOp()) + return matcher.match(defOp); + return false; +} + +/// Terminal matcher, always returns true. 
+struct AnyValueMatcher { + bool match(Value op) const { return true; } +}; + +/// Binds to a specific value and matches it. +struct PatternMatcherValue { + PatternMatcherValue(Value val) : value(val) {} + bool match(Value val) const { return val == value; } + Value value; +}; + +template +constexpr void enumerateImpl(TupleT &&tuple, CallbackT &&callback, + std::index_sequence) { + (void)std::initializer_list{ + 0, + (callback(std::integral_constant{}, std::get(tuple)), + 0)...}; +} + +template +constexpr void enumerate(std::tuple &tuple, CallbackT &&callback) { + detail::enumerateImpl(tuple, std::forward(callback), + std::make_index_sequence{}); +} + +/// RecursivePatternMatcher that composes. +template +struct RecursivePatternMatcher { + RecursivePatternMatcher(OperandMatchers... matchers) + : operandMatchers(matchers...) {} + bool match(Operation *op) { + if (!isa(op) || op->getNumOperands() != sizeof...(OperandMatchers)) + return false; + bool res = true; + enumerate(operandMatchers, [&](size_t index, auto &matcher) { + res &= matchOperandOrValueAtIndex(op, index, matcher); + }); + return res; + } + std::tuple operandMatchers; +}; + +} // end namespace detail + +/// Matches a value from a constant foldable operation and writes the value to +/// bind_value. +template +inline detail::constant_op_binder m_Constant(AttrT *bind_value) { + return detail::constant_op_binder(bind_value); +} + +/// Matches a constant scalar / vector splat / tensor splat integer one. +inline detail::constant_int_value_matcher<1> m_One() { + return detail::constant_int_value_matcher<1>(); +} + +/// Matches the given OpClass. +template inline detail::op_matcher m_Op() { + return detail::op_matcher(); +} + +/// Matches a constant scalar / vector splat / tensor splat integer zero. +inline detail::constant_int_value_matcher<0> m_Zero() { + return detail::constant_int_value_matcher<0>(); +} + +/// Matches a constant scalar / vector splat / tensor splat integer that is any +/// non-zero value. 
+inline detail::constant_int_not_value_matcher<0> m_NonZero() { + return detail::constant_int_not_value_matcher<0>(); +} + +/// Entry point for matching a pattern over a Value. +template +inline bool matchPattern(Value value, const Pattern &pattern) { + // TODO: handle other cases + if (auto *op = value->getDefiningOp()) + return const_cast(pattern).match(op); + return false; +} + +/// Entry point for matching a pattern over an Operation. +template +inline bool matchPattern(Operation *op, const Pattern &pattern) { + return const_cast(pattern).match(op); +} + +/// Matches a constant holding a scalar/vector/tensor integer (splat) and +/// writes the integer value to bind_value. +inline detail::constant_int_op_binder +m_ConstantInt(IntegerAttr::ValueType *bind_value) { + return detail::constant_int_op_binder(bind_value); +} + +template +auto m_Op(Matchers... matchers) { + return detail::RecursivePatternMatcher(matchers...); +} + +namespace matchers { +inline auto m_Any() { return detail::AnyValueMatcher(); } +inline auto m_Val(Value v) { return detail::PatternMatcherValue(v); } +} // namespace matchers + +} // end namespace mlir + +#endif // MLIR_MATCHERS_H diff --git a/mlir/include/mlir/IR/Module.h b/mlir/include/mlir/IR/Module.h new file mode 100644 index 0000000000000000000000000000000000000000..babc51aad0d8b983932758c9a63c4a3791e78981 --- /dev/null +++ b/mlir/include/mlir/IR/Module.h @@ -0,0 +1,167 @@ +//===- Module.h - MLIR Module Class -----------------------------*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Module is the top-level container for code in an MLIR program. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_MODULE_H +#define MLIR_IR_MODULE_H + +#include "mlir/IR/SymbolTable.h" + +namespace mlir { +class ModuleTerminatorOp; + +//===----------------------------------------------------------------------===// +// Module Operation. +//===----------------------------------------------------------------------===// + +/// ModuleOp represents a module, or an operation containing one region with a +/// single block containing opaque operations. The region of a module is not +/// allowed to implicitly capture global values, and all external references +/// must use symbolic references via attributes(e.g. via a string name). +class ModuleOp + : public Op< + ModuleOp, OpTrait::ZeroOperands, OpTrait::ZeroResult, + OpTrait::IsIsolatedFromAbove, OpTrait::SymbolTable, + OpTrait::SingleBlockImplicitTerminator::Impl> { +public: + using Op::Op; + using Op::print; + + static StringRef getOperationName() { return "module"; } + + static void build(Builder *builder, OperationState &result, + Optional name = llvm::None); + + /// Construct a module from the given location with an optional name. + static ModuleOp create(Location loc, Optional name = llvm::None); + + /// Operation hooks. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + void print(OpAsmPrinter &p); + LogicalResult verify(); + + /// Return body of this module. + Region &getBodyRegion(); + Block *getBody(); + + /// Return the name of this module if present. + Optional getName(); + + /// Print the this module in the custom top-level form. + void print(raw_ostream &os, OpPrintingFlags flags = llvm::None); + void dump(); + + //===--------------------------------------------------------------------===// + // Body Management. + //===--------------------------------------------------------------------===// + + /// Iteration over the operations in the module. 
+ using iterator = Block::iterator; + + iterator begin() { return getBody()->begin(); } + iterator end() { return getBody()->end(); } + Operation &front() { return *begin(); } + + /// This returns a range of operations of the given type 'T' held within the + /// module. + template iterator_range> getOps() { + return getBody()->getOps(); + } + + /// Insert the operation into the back of the body, before the terminator. + void push_back(Operation *op) { + insert(Block::iterator(getBody()->getTerminator()), op); + } + + /// Insert the operation at the given insertion point. Note: The operation is + /// never inserted after the terminator, even if the insertion point is end(). + void insert(Operation *insertPt, Operation *op) { + insert(Block::iterator(insertPt), op); + } + void insert(Block::iterator insertPt, Operation *op) { + auto *body = getBody(); + if (insertPt == body->end()) + insertPt = Block::iterator(body->getTerminator()); + body->getOperations().insert(insertPt, op); + } +}; + +/// The ModuleTerminatorOp is a special terminator operation for the body of a +/// ModuleOp, it has no semantic meaning beyond keeping the body of a ModuleOp +/// well-formed. +/// +/// This operation does _not_ have a custom syntax. However, ModuleOp will omit +/// the terminator in their custom syntax for brevity. +class ModuleTerminatorOp + : public Op::Impl, OpTrait::IsTerminator> { +public: + using Op::Op; + static StringRef getOperationName() { return "module_terminator"; } + static void build(Builder *, OperationState &) {} +}; + +/// This class acts as an owning reference to a module, and will automatically +/// destroy the held module if valid. +class OwningModuleRef { +public: + OwningModuleRef(std::nullptr_t = nullptr) {} + OwningModuleRef(ModuleOp module) : module(module) {} + OwningModuleRef(OwningModuleRef &&other) : module(other.release()) {} + ~OwningModuleRef() { + if (module) + module.erase(); + } + + // Assign from another module reference. 
+ OwningModuleRef &operator=(OwningModuleRef &&other) { + if (module) + module.erase(); + module = other.release(); + return *this; + } + + /// Allow accessing the internal module. + ModuleOp get() const { return module; } + ModuleOp operator*() const { return module; } + ModuleOp *operator->() { return &module; } + explicit operator bool() const { return module; } + + /// Release the referenced module. + ModuleOp release() { + ModuleOp released; + std::swap(released, module); + return released; + } + +private: + ModuleOp module; +}; + +} // end namespace mlir + +namespace llvm { + +/// Allow stealing the low bits of ModuleOp. +template <> struct PointerLikeTypeTraits { +public: + static inline void *getAsVoidPointer(mlir::ModuleOp I) { + return const_cast(I.getAsOpaquePointer()); + } + static inline mlir::ModuleOp getFromVoidPointer(void *P) { + return mlir::ModuleOp::getFromOpaquePointer(P); + } + enum { NumLowBitsAvailable = 3 }; +}; + +} // end namespace llvm + +#endif // MLIR_IR_MODULE_H diff --git a/mlir/include/mlir/IR/OpAsmInterface.td b/mlir/include/mlir/IR/OpAsmInterface.td new file mode 100644 index 0000000000000000000000000000000000000000..7e31c07575e1b35ab18ea2044420e7c52c7a054f --- /dev/null +++ b/mlir/include/mlir/IR/OpAsmInterface.td @@ -0,0 +1,54 @@ +//===- OpAsmInterface.td - Asm Interfaces for opse ---------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains Interfaces for interacting with the AsmParser and +// AsmPrinter. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_OPASMINTERFACE +#define MLIR_OPASMINTERFACE + +include "mlir/IR/OpBase.td" + +/// Interface for hooking into the OpAsmPrinter and OpAsmParser. +def OpAsmOpInterface : OpInterface<"OpAsmOpInterface"> { + let description = [{ + This interface provides hooks to interact with the AsmPrinter and AsmParser + classes. + }]; + + let methods = [ + InterfaceMethod<[{ + Get a special name to use when printing the results of this operation. + The given callback is invoked with a specific result value that starts a + result "pack", and the name to give this result pack. To signal that a + result pack should use the default naming scheme, a None can be passed + in instead of the name. + + For example, if you have an operation that has four results and you want + to split these into three distinct groups you could do the following: + + ```c++ + setNameFn(getResult(0), "first_result"); + setNameFn(getResult(1), "middle_results"); + setNameFn(getResult(3), ""); // use the default numbering. + ``` + + This would print the operation as follows: + + ```mlir + %first_result, %middle_results:2, %0 = "my.op" ... + ``` + }], + "void", "getAsmResultNames", (ins "OpAsmSetValueNameFn":$setNameFn) + >, + ]; +} + +#endif // MLIR_OPASMINTERFACE diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td new file mode 100644 index 0000000000000000000000000000000000000000..c457d25fc51ae6313780598183e004d08101a630 --- /dev/null +++ b/mlir/include/mlir/IR/OpBase.td @@ -0,0 +1,1872 @@ +//===-- OpBase.td - Base op definition file ----------------*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the base operation definition file. +// +//===----------------------------------------------------------------------===// + +#ifndef OP_BASE +#define OP_BASE + +//===----------------------------------------------------------------------===// +// Common utilities for defining TableGen mechanisms +//===----------------------------------------------------------------------===// + +// A workaround for the inability to define functions in Tablegen. +// +// The template parameter defines a string that can be extracted from an +// instance of this class by accessing the "result" member. Subclasses can take +// their own template parameters as function "arguments" and use them to +// populate result. +// For example, if it didn't already exist, a concat function could be defined +// like: +// +// class StrConcat strings> : +// StrFunc +// +// and then called like +// +// StrConcat<["a", "b", "c"]>.result +// +// to get the string "abc" +class StrFunc { + string result = r; +} + +// Concatenates a list of strings with a separator (default ", ") +class StrJoin strings, string sep = ", "> : + StrFunc; + +// Concatenates a list of integers into a string with a separator (default ", ") +class StrJoinInt integers, string sep = ", "> : + StrJoin(i)), sep>; + +//===----------------------------------------------------------------------===// +// Predicate definitions +//===----------------------------------------------------------------------===// + +// Base class for logical predicates. +// +// Predicates are used to compose constraints (see next section for details). +// There are two categories of predicates: +// +// 1. CPred: the primitive leaf predicate. +// 2. Compound predicate: a predicate composed from child predicates using +// predicate combiners ("conjunction", "disjunction", "negation" or +// "substitution"). 
+class Pred; + +// A logical predicate wrapping any C expression. +// +// This is the basis for composing more complex predicates. It is the "atom" +// predicate from the perspective of TableGen and the "interface" between +// TableGen and C++. What is inside is already C++ code, which will be treated +// as opaque strings with special placeholders to be substituted. +// +// ## Special placeholders +// +// Special placeholders can be used to refer to entities in the context where +// this predicate is used. They serve as "hooks" to the enclosing environment. +// The following special placeholders are supported in constraints for an op: +// +// * `$_builder` will be replaced by a mlir::Builder instance. +// * `$_op` will be replaced by the current operation. +// * `$_self` will be replaced with the entity this predicate is attached to. +// E.g., `BoolAttr` is an attribute constraint that wraps a +// `CPred<"$_self.isa()">` (see the following sections for details). +// Then for `F32:$attr`,`$_self` will be replaced by `$attr`. +// For type constraints, it's a little bit special since we want the +// constraints on each type definition reads naturally and we want to attach +// type constraints directly to an operand/result, $_self will be replaced +// by the operand/result's type. E.g., for `F32` in `F32:$operand`, its +// `$_self` will be expanded as `getOperand(...)->getType()`. +class CPred : Pred { + code predExpr = "(" # pred # ")"; +} + +// Kinds of predicate combiners. These must closely match the predicates +// implemented by the C++ backend (tblgen::PredCombinerKind). +class PredCombinerKind; +def PredCombinerAnd : PredCombinerKind; +def PredCombinerOr : PredCombinerKind; +def PredCombinerNot : PredCombinerKind; +def PredCombinerSubstLeaves : PredCombinerKind; +def PredCombinerConcat : PredCombinerKind; + +// A predicate that combines other predicates as defined by PredCombinerKind. +// Instantiated below. 
+class CombinedPred c> : Pred { + PredCombinerKind kind = k; + list children = c; +} + +// Predicate combiners + +// A predicate that holds if all of its children hold. Always holds for zero +// children. +class And children> : CombinedPred; + +// A predicate that holds if any of its children hold. Never holds for zero +// children. +class Or children> : CombinedPred; + +// A predicate that holds if its child does not. +class Neg : CombinedPred; + +// A predicate that substitutes "pat" with "repl" in predicate calls of the +// leaves of the predicate tree (i.e., not CombinedPred). +// +// This is plain string substitution without regular expressions or captures. +// New predicates with more complex logical can be introduced should the need +// arise. +class SubstLeaves + : CombinedPred { + string pattern = pat; + string replacement = repl; +} + +// A predicate that prepends `pre` and appends `suf` to the final predicate +// string composed from `child`. This is plain string concatenation and there +// will be no substitution happening for `pre` and `suf`. +class Concat : + CombinedPred { + string prefix = pre; + string suffix = suf; +} + +//===----------------------------------------------------------------------===// +// Constraint definitions +//===----------------------------------------------------------------------===// + +// TODO(b/130064155): Merge Constraints into Pred. + +// Base class for named constraints. +// +// An op's operands/attributes/results can have various requirements, e.g., +// having certain types, having values inside a certain range, and so on. +// Besides, for a graph rewrite rule, the source pattern used to match against +// the existing graph has conditions, like the op's operand must be of a more +// constrained subtype, the attribute must have a certain value, and so on. +// +// These requirements and conditions are modeled using this class. 
Records of +// this class are used to generate verification code in op verifier, and +// matching code in pattern matcher. +// +// Constraints are predicates with descriptive names, to facilitate inspection, +// provide nice error messages, etc. +class Constraint { + // The predicates that this constraint requires. + Pred predicate = pred; + // User-readable description used in error reporting messages. If empty, a + // generic message will be used. + string description = desc; +} + +// Subclasses used to differentiate different constraint kinds. These are used +// as markers for the TableGen backend to handle different constraint kinds +// differently if needed. Constraints not deriving from the following subclasses +// are considered as uncategorized constraints. + +// Subclass for constraints on a type. +class TypeConstraint : + Constraint; + +// Subclass for constraints on an attribute. +class AttrConstraint : + Constraint; + +// Subclass for constraints on a region. +class RegionConstraint : + Constraint; + +// How to use these constraint categories: +// +// * Use TypeConstraint to specify +// * Constraints on an op's operand/result definition +// * Further constraints to match an op's operand/result in source pattern +// +// * Use Attr (a subclass for AttrConstraint) for +// * Constraints on an op's attribute definition +// * Use AttrConstraint to specify +// * Further constraints to match an op's attribute in source pattern +// +// * Use uncategorized constraint to specify +// * Multi-entity constraints in rewrite rules + +//===----------------------------------------------------------------------===// +// Common predicates +//===----------------------------------------------------------------------===// + +// Whether a type is a VectorType. +def IsVectorTypePred : CPred<"$_self.isa()">; + +// Whether a type is a TensorType. +def IsTensorTypePred : CPred<"$_self.isa()">; + +// Whether a type is a MemRefType. 
+def IsMemRefTypePred : CPred<"$_self.isa()">; + +// Whether a type is an IsUnrankedMemRefType +def IsUnrankedMemRefTypePred : CPred<"$_self.isa()">; + +// Whether a type is a ShapedType. +def IsShapedTypePred : CPred<"$_self.isa()">; + +// For a ShapedType, verify that it has a static shape. +def HasStaticShapePred : CPred<"$_self.cast().hasStaticShape()">; + +// Whether a type is a TupleType. +def IsTupleTypePred : CPred<"$_self.isa()">; + +//===----------------------------------------------------------------------===// +// Dialect definitions +//===----------------------------------------------------------------------===// + +class Dialect { + // The name of the dialect. + string name = ?; + + // Short summary of the dialect. + string summary = ?; + + // The description of the dialect. + string description = ?; + + // The C++ namespace that ops of this dialect should be placed into. + // + // By default, uses the name of the dialect as the only namespace. To avoid + // placing in any namespace, use "". To specify nested namespaces, use "::" + // as the delimiter, e.g., given "A::B", ops will be placed in + // `namespace A { namespace B { } }`. + // + // Note that this works in conjunction with dialect C++ code. Depending on how + // the generated files are included into the dialect, you may want to specify + // a full namespace path or a partial one. + string cppNamespace = name; +} + +//===----------------------------------------------------------------------===// +// Type definitions +//===----------------------------------------------------------------------===// + +// A type, carries type constraints. +class Type : + TypeConstraint { + string typeDescription = ""; +} + +// Allows providing an alternative name and description to an existing type def. +class TypeAlias : + Type { + let typeDescription = t.typeDescription; +} + +// A type of a specific dialect. +class DialectType : + Type { + Dialect dialect = d; +} + +// A variadic type constraint. 
It expands to zero or more of the base type. This +// class is used for supporting variadic operands/results. An op can declare no +// more than one variadic operand/result, and that operand/result must be the +// last one in the operand/result list. +class Variadic : TypeConstraint { + Type baseType = type; +} + +// A type that can be constructed using MLIR::Builder. +// Note that this does not "inherit" from Type because it would require +// duplicating Type subclasses for buildable and non-buildable cases to avoid +// diamond "inheritance". +// TODO(zinenko): we may extend this to a more general 'Buildable' trait, +// making some Types and some Attrs buildable. +class BuildableType { + // The builder call to invoke (if specified) to construct the BuildableType. + // Format: this will be affixed to the builder. + code builderCall = builder; +} + +// Any type at all. +def AnyType : Type, "any type">; + +// None type +def NoneType : Type()">, "none type">; + +// Any type from the given list +class AnyTypeOf allowedTypes, string description = ""> : Type< + // Satisfy any of the allowed type's condition + Or, + !if(!eq(description, ""), + StrJoin.result, + description)>; + +// Integer types. +// Any integer type irrespective of its width. +def AnyInteger : Type()">, "integer">; + +// Index type. +def Index : Type()">, "index">; + +// Integer type of a specific width. +class I + : Type, + width # "-bit integer">, + BuildableType<"getIntegerType(" # width # ")"> { + int bitwidth = width; +} + +class IntOfWidths widths> : + AnyTypeOf), + StrJoinInt.result # "-bit integer">; + +def I1 : I<1>; +def I8 : I<8>; +def I16 : I<16>; +def I32 : I<32>; +def I64 : I<64>; + +// Floating point types. + +// Any float type irrespective of its width. +def AnyFloat : Type()">, "floating-point">; + +// Float type of a specific width. 
+class F + : Type, + width # "-bit float">, + BuildableType<"getF" # width # "Type()"> { + int bitwidth = width; +} + +class FloatOfWidths widths> : + AnyTypeOf), + StrJoinInt.result # "-bit float">; + +def F16 : F<16>; +def F32 : F<32>; +def F64 : F<64>; + +def BF16 : Type, "bfloat16 type">, + BuildableType<"getBF16Type()">; + +class Complex + : Type()">, + SubstLeaves<"$_self", "$_self.cast().getElementType()", + type.predicate>]>, + "complex type with " # type.description # " elements"> { + Type elementType = type; +} + +def AnyComplex : Type()">, "complex-type">; + +class OpaqueType + : Type, + description>; + +// Function Type + +// Any function type. +def FunctionType : Type()">, "function type">; + +// A container type is a type that has another type embedded within it. +class ContainerType : + // First, check the container predicate. Then, substitute the extracted + // element into the element type checker. + Type(elementTypeCall), + etype.predicate>]>, + descr # " of " # etype.description # " values"> { + // The type of elements in the container. + Type elementType = etype; + + // Call to retrieve. + code getElementTypeCall = elementTypeCall; +} + +class ShapedContainerType allowedTypes, Pred containerPred, string descr> : + ContainerType, containerPred, + "$_self.cast().getElementType()", descr>; + +// Whether a shaped type is ranked. +def HasRankPred : CPred<"$_self.cast().hasRank()">; + +// Whether a shaped type has one of the specified ranks. +class HasAnyRankOfPred ranks> : And<[ + HasRankPred, + Or().getRank() == " # rank>)>]>; + +// Vector types. 
+ +class VectorOf allowedTypes> : + ShapedContainerType; + +// Whether the number of elements of a vector is from the given +// `allowedLengths` list +class IsVectorOfLengthPred allowedLengths> : + And<[IsVectorTypePred, + Or().getNumElements() + == }] + # allowedlength>)>]>; + +// Any vector where the number of elements is from the given +// `allowedLengths` list +class VectorOfLength allowedLengths> : Type< + IsVectorOfLengthPred, + " of length " # StrJoinInt.result>; + + +// Any vector where the number of elements is from the given +// `allowedLengths` list and the type is from the given `allowedTypes` +// list +class VectorOfLengthAndType allowedLengths, + list allowedTypes> : Type< + And<[VectorOf.predicate, + VectorOfLength.predicate]>, + VectorOf.description # + VectorOfLength.description>; + +def AnyVector : VectorOf<[AnyType]>; + +// Tensor types. + +// Any tensor type whose element type is from the given `allowedTypes` list +class TensorOf allowedTypes> : + ShapedContainerType; + +def AnyTensor : TensorOf<[AnyType]>; + +def AnyRankedTensor : + ShapedContainerType<[AnyType], And<[IsTensorTypePred, HasRankPred]>, + "ranked tensor">; + +// TODO(b/130064155) Have an easy way to add another constraint to a type. +class StaticShapeTensorOf allowedTypes> + : Type.predicate, HasStaticShapePred]>, + "statically shaped " # TensorOf.description>; + +def AnyStaticShapeTensor : StaticShapeTensorOf<[AnyType]>; + +def I1Tensor : TensorOf<[I1]>; +def I8Tensor : TensorOf<[I8]>; +def I16Tensor : TensorOf<[I16]>; +def I32Tensor : TensorOf<[I32]>; +def I64Tensor : TensorOf<[I64]>; + +def BF16Tensor : TensorOf<[BF16]>; +def F16Tensor : TensorOf<[F16]>; +def F32Tensor : TensorOf<[F32]>; +def F64Tensor : TensorOf<[F64]>; + +// Ranked tensor type with one of the specified types and ranks. 
+class TensorRankOf allowedTypes, list ranks> : + Type.predicate, HasAnyRankOfPred]>, + StrJoin.result # " " # + TensorOf.description>; + +class 0DTensorOf allowedTypes> : TensorRankOf; +class 1DTensorOf allowedTypes> : TensorRankOf; +class 2DTensorOf allowedTypes> : TensorRankOf; +class 3DTensorOf allowedTypes> : TensorRankOf; +class 4DTensorOf allowedTypes> : TensorRankOf; + +// Unranked Memref type +def AnyUnrankedMemRef : + ShapedContainerType<[AnyType], + IsUnrankedMemRefTypePred, "unranked.memref">; +// Memref type. + +// Memrefs are blocks of data with fixed type and rank. +class MemRefOf allowedTypes> : + ShapedContainerType; + +def AnyMemRef : MemRefOf<[AnyType]>; + +def AnyRankedOrUnrankedMemRef: AnyTypeOf<[AnyUnrankedMemRef, AnyMemRef]>; + +// Memref declarations handle any memref, independent of rank, size, (static or +// dynamic), layout, or memory space. +def I1MemRef : MemRefOf<[I1]>; +def I8MemRef : MemRefOf<[I8]>; +def I16MemRef : MemRefOf<[I16]>; +def I32MemRef : MemRefOf<[I32]>; +def I64MemRef : MemRefOf<[I64]>; + +def BF16MemRef : MemRefOf<[BF16]>; +def F16MemRef : MemRefOf<[F16]>; +def F32MemRef : MemRefOf<[F32]>; +def F64MemRef : MemRefOf<[F64]>; + +// TODO(b/130064155) Have an easy way to add another constraint to a type. +class MemRefRankOf allowedTypes, list ranks> : + Type.predicate, HasAnyRankOfPred]>, + StrJoin.result # " " # + MemRefOf.description>; + +class StaticShapeMemRefOf allowedTypes> + : Type.predicate, HasStaticShapePred]>, + "statically shaped " # MemRefOf.description>; + +def AnyStaticShapeMemRef : StaticShapeMemRefOf<[AnyType]>; + +// For a MemRefType, verify that it has strides. 
+def HasStridesPred : CPred<[{ isStrided($_self.cast()) }]>; + +class StridedMemRefOf allowedTypes> + : Type.predicate, HasStridesPred]>, + "strided " # MemRefOf.description>; + +def AnyStridedMemRef : StridedMemRefOf<[AnyType]>; + +class AnyStridedMemRefOfRank : + Type.predicate]>, + AnyStridedMemRef.description # " of rank " # rank>; + +// This represents a generic tuple without any constraints on element type. +def AnyTuple : Type; + +// A container type that has other types embedded in it, but (unlike +// ContainerType) can hold elements with a mix of types. Requires a call that +// produces a list of all elements' types. +class MixedContainerType : + Type< + And<[ + containerPred, + Concat< + "llvm::all_of(" # elementTypesCall # ", [](Type t) { return ", + SubstLeaves<"$_self", "t", etype.predicate>, + "; })" + > + ]>, + descr # " with any combination of " # etype.description # " values"> { + // The type of elements in the container. + Type elementType = etype; + + // Call to retrieve. + code getElementTypesCall = elementTypesCall; +} + +// A Tuple that holds a mix of elements of the allowed types. +class TupleOf allowedTypes> + : MixedContainerType, IsTupleTypePred, + "$_self.cast().getTypes()", "tuple">; + +// A Tuple with arbitrary nesting, where all elements are a mix of the allowed +// types. +class NestedTupleOf allowedTypes> : + MixedContainerType, IsTupleTypePred, + "getFlattenedTypes($_self.cast())", + "nested tuple">; + +//===----------------------------------------------------------------------===// +// Common type constraints +//===----------------------------------------------------------------------===// + +// Type constraint for bool-like types: bools, vectors of bools, tensors of +// bools. +def BoolLike : TypeConstraint.predicate, + TensorOf<[I1]>.predicate]>, + "bool-like">; + +// Type constraint for integer-like types: integers, indices, vectors of +// integers, tensors of integers. 
+def IntegerLike : TypeConstraint.predicate, TensorOf<[AnyInteger]>.predicate]>, + "integer-like">; + +// Type constraint for float-like types: floats, vectors or tensors thereof. +def FloatLike : TypeConstraint.predicate, TensorOf<[AnyFloat]>.predicate]>, + "floating-point-like">; + + +//===----------------------------------------------------------------------===// +// Attribute definitions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Base attribute definition + +// Base class for all attributes. +class Attr : + AttrConstraint { + code storageType = ?; // The backing mlir::Attribute type + code returnType = ?; // The underlying C++ value type + + // The call expression to convert from the storage type to the return + // type. For example, an enum can be stored as an int but returned as an + // enum class. + // + // Format: $_self will be expanded to the attribute. + // + // For example, `$_self.getValue().getSExtValue()` for `IntegerAttr val` will + // expand to `getAttrOfType("val").getValue().getSExtValue()`. + code convertFromStorage = "$_self.getValue()"; + + // The call expression to build an attribute from a constant value. + // + // Format: $0 will be expanded to the constant value of the attribute. + // + // For example, `$_builder.getStringAttr("$0")` for `StringAttr:"foo"` will + // expand to `builder.getStringAttr("foo")`. + string constBuilderCall = ?; + + // Default value for attribute. + // Requires a constBuilderCall defined. + string defaultValue = ?; + + // Whether the attribute is optional. Typically requires a custom + // convertFromStorage method to handle the case where the attribute is + // not present. + bit isOptional = 0; + + // What is the base-level Attr instantiation that this Attr is built upon. + // Unset means this is a base-level Attr. 
+ // + // This field is used by attribute wrapper classes (DefaultValuedAttr, + // OptionalAttr, etc.) to retrieve the base-level attribute definition. + // This can be used for getting its name; otherwise, we will see + // "anonymous_" as the attribute def name because of template + // instantiation. + // TOOD(b/132458159): deduplicate the fields in attribute wrapper classes. + Attr baseAttr = ?; +} + +//===----------------------------------------------------------------------===// +// Attribute modifier definition + +// Decorates an attribute to have an (unvalidated) default value if not present. +class DefaultValuedAttr : + Attr { + // Construct this attribute with the input attribute and change only + // the default value. + // Note: this has to be kept up to date with Attr above. + let storageType = attr.storageType; + let returnType = attr.returnType; + let convertFromStorage = attr.convertFromStorage; + let constBuilderCall = attr.constBuilderCall; + let defaultValue = val; + + let baseAttr = attr; +} + +// Decorates an attribute as optional. The return type of the generated +// attribute accessor method will be Optional<>. +class OptionalAttr : Attr { + // Rewrite the attribute to be optional. + // Note: this has to be kept up to date with Attr above. + let storageType = attr.storageType; + let returnType = "Optional<" # attr.returnType #">"; + let convertFromStorage = "$_self ? " # returnType # "(" # + attr.convertFromStorage # ") : (llvm::None)"; + let isOptional = 1; + + let baseAttr = attr; +} + +//===----------------------------------------------------------------------===// +// Primitive attribute kinds + +// A generic attribute that must be constructed around a specific type +// `attrValType`. Backed by MLIR attribute kind `attrKind`. +class TypedAttrBase : + Attr { + let constBuilderCall = "$_builder.get" # attrKind # "($_builder." # + attrValType.builderCall # ", $0)"; + let storageType = attrKind; +} + +// Any attribute. 
+def AnyAttr : Attr, "any attribute"> { + let storageType = "Attribute"; + let returnType = "Attribute"; + let convertFromStorage = "$_self"; + let constBuilderCall = "$0"; +} + +def BoolAttr : Attr()">, "bool attribute"> { + let storageType = [{ BoolAttr }]; + let returnType = [{ bool }]; + let constBuilderCall = "$_builder.getBoolAttr($0)"; +} + +// Base class for integer attributes of fixed width. +class IntegerAttrBase : + TypedAttrBase< + attrValType, "IntegerAttr", + And<[CPred<"$_self.isa()">, + CPred<"$_self.cast().getType()." + "isInteger(" # attrValType.bitwidth # ")">]>, + descr> { + let returnType = [{ APInt }]; +} + +def APIntAttr : Attr()">, + "arbitrary integer attribute"> { + let storageType = [{ IntegerAttr }]; + let returnType = [{ APInt }]; +} + +def I1Attr : IntegerAttrBase; +def I8Attr : IntegerAttrBase; +def I16Attr : IntegerAttrBase; +def I32Attr : IntegerAttrBase; +def I64Attr : IntegerAttrBase; + +class NonNegativeIntAttrBase : + TypedAttrBase< + attrValType, "IntegerAttr", + And<[IntegerAttrBase.predicate, + CPred<"!$_self.cast().getValue().isNegative()">]>, + descr> { + let returnType = [{ APInt }]; +} + +def NonNegativeI32Attr : NonNegativeIntAttrBase< + I32, "non-negative 32-bit integer attribute">; +def NonNegativeI64Attr : NonNegativeIntAttrBase< + I64, "non-negative 64-bit integer attribute">; + +class PositiveIntAttrBase : + TypedAttrBase< + attrValType, "IntegerAttr", + And<[IntegerAttrBase.predicate, + CPred<"$_self.cast().getValue()" + ".isStrictlyPositive()">]>, + descr> { + let returnType = [{ APInt }]; +} + +def PositiveI32Attr : PositiveIntAttrBase< + I32, "positive 32-bit integer attribute">; +def PositiveI64Attr : PositiveIntAttrBase< + I64, "positive 64-bit integer attribute">; + +// Base class for float attributes of fixed width. 
+class FloatAttrBase : + TypedAttrBase()">, + CPred<"$_self.cast().getType().isF" # + attrValType.bitwidth # "()">]>, + descr> { + let returnType = [{ APFloat }]; +} + +def F32Attr : FloatAttrBase; +def F64Attr : FloatAttrBase; + +// An attribute backed by a string type. +class StringBasedAttr : Attr { + let constBuilderCall = "$_builder.getStringAttr(\"$0\")"; + let storageType = [{ StringAttr }]; + let returnType = [{ StringRef }]; +} + +def StrAttr : StringBasedAttr()">, + "string attribute">; + +// Base class for attributes containing types. Example: +// def IntTypeAttr : TypeAttrBase<"IntegerType", "integer type attribute"> +// defines a type attribute containing an integer type. +class TypeAttrBase : + Attr()">, + CPred<"$_self.cast().getValue().isa<" # retType # ">()">]>, + description> { + let storageType = [{ TypeAttr }]; + let returnType = retType; + let convertFromStorage = "$_self.getValue().cast<" # retType # ">()"; +} + +def TypeAttr : TypeAttrBase<"Type", "any type attribute">; + +// The mere presence of unit attributes has a meaning. Therefore, unit +// attributes are always treated as optional and accessors to them return +// "true" if the attribute is present and "false" otherwise. +def UnitAttr : Attr()">, "unit attribute"> { + let storageType = [{ UnitAttr }]; + let constBuilderCall = "$_builder.getUnitAttr()"; + let convertFromStorage = "$_self != nullptr"; + let returnType = "bool"; + let isOptional = 1; +} + +//===----------------------------------------------------------------------===// +// Enum attribute kinds + +// Additional information for an enum attribute case. +class EnumAttrCaseInfo { + // The C++ enumerant symbol + string symbol = sym; + + // The C++ enumerant value + // If less than zero, there will be no explicit discriminator values assigned + // to enumerators in the generated enum class. + int value = val; +} + +// An enum attribute case stored with StringAttr. 
+class StrEnumAttrCase : + EnumAttrCaseInfo, + StringBasedAttr< + CPred<"$_self.cast().getValue() == \"" # sym # "\"">, + "case " # sym>; + +// An enum attribute case stored with IntegerAttr. +class IntEnumAttrCaseBase : + EnumAttrCaseInfo, + IntegerAttrBase { + let predicate = + CPred<"$_self.cast().getInt() == " # val>; +} + +class I32EnumAttrCase : IntEnumAttrCaseBase; +class I64EnumAttrCase : IntEnumAttrCaseBase; + +// A bit enum case stored with 32-bit IntegerAttr. `val` here is *not* the +// ordinal number of the bit that is set. It is the 32-bit integer with only +// one bit set. +class BitEnumAttrCase : + EnumAttrCaseInfo, + IntegerAttrBase { + let predicate = CPred< + "$_self.cast().getValue().getZExtValue() & " # val # "u">; +} + +// Additional information for an enum attribute. +class EnumAttrInfo cases> { + // The C++ enum class name + string className = name; + + // List of all accepted cases + list enumerants = cases; + + // The following fields are only used by the EnumsGen backend to generate + // an enum class definition and conversion utility functions. + + // The underlying type for the C++ enum class. An empty string mean the + // underlying type is not explicitly specified. + string underlyingType = ""; + + // The C++ namespaces that the enum class definition and utility functions + // should be placed into. + // + // Normally you want to place the full namespace path here. If it is nested, + // use "::" as the delimiter, e.g., given "A::B", generated code will be + // placed in `namespace A { namespace B { ... } }`. To avoid placing in any + // namespace, use "". + // TODO(b/134741431): use dialect to provide the namespace. + string cppNamespace = ""; + + // The name of the utility function that converts a value of the underlying + // type to the corresponding symbol. 
It will have the following signature: + // + // ```c++ + // llvm::Optional<> (); + // ``` + string underlyingToSymbolFnName = "symbolize" # name; + + // The name of the utility function that converts a string to the + // corresponding symbol. It will have the following signature: + // + // ```c++ + // llvm::Optional<> (llvm::StringRef); + // ``` + string stringToSymbolFnName = "symbolize" # name; + + // The name of the utility function that converts a symbol to the + // corresponding string. It will have the following signature: + // + // ```c++ + // (); + // ``` + string symbolToStringFnName = "stringify" # name; + string symbolToStringFnRetType = "llvm::StringRef"; + + // The name of the utility function that returns the max enum value used + // within the enum class. It will have the following signature: + // + // ```c++ + // static constexpr unsigned (); + // ``` + string maxEnumValFnName = "getMaxEnumValFor" # name; +} + +// An enum attribute backed by StringAttr. +// +// Op attributes of this kind are stored as StringAttr. Extra verification will +// be generated on the string though: only the symbols of the allowed cases are +// permitted as the string value. +class StrEnumAttr cases> : + EnumAttrInfo, + StringBasedAttr< + And<[StrAttr.predicate, Or]>, + !if(!empty(description), "allowed string cases: " # + StrJoin.result, + description)>; + +// An enum attribute backed by IntegerAttr. +// +// Op attributes of this kind are stored as IntegerAttr. Extra verification will +// be generated on the integer though: only the values of the allowed cases are +// permitted as the integer value. 
+class IntEnumAttr cases> : + EnumAttrInfo, + IntegerAttrBase.result, description)> { + let predicate = And<[ + IntegerAttrBase.predicate, + Or]>; +} + +class I32EnumAttr cases> : + IntEnumAttr { + let returnType = cppNamespace # "::" # name; + let underlyingType = "uint32_t"; + let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())"; + let constBuilderCall = "$_builder.getI32IntegerAttr(static_cast($0))"; +} +class I64EnumAttr cases> : + IntEnumAttr { + let returnType = cppNamespace # "::" # name; + let underlyingType = "uint64_t"; + let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())"; + let constBuilderCall = "$_builder.getI64IntegerAttr(static_cast($0))"; +} + +// A bit enum stored with 32-bit IntegerAttr. +// +// Op attributes of this kind are stored as IntegerAttr. Extra verification will +// be generated on the integer to make sure only allowed bit are set. Besides, +// helper methods are generated to parse a string separated with a specified +// delimiter to a symbol and vice versa. +class BitEnumAttr cases> : + EnumAttrInfo, IntegerAttrBase { + let predicate = And<[ + IntegerAttrBase.predicate, + // Make sure we don't have unknown bit set. + CPred<"!($_self.cast().getValue().getZExtValue() & (~(" # + StrJoin.result # + ")))"> + ]>; + + let returnType = cppNamespace # "::" # name; + let underlyingType = "uint32_t"; + let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())"; + let constBuilderCall = "$_builder.getI32IntegerAttr(static_cast($0))"; + + // We need to return a string because we may concatenate symbols for multiple + // bits together. + let symbolToStringFnRetType = "std::string"; + + // The delimiter used to separate bit enum cases in strings. 
+ string separator = "|"; +} + +//===----------------------------------------------------------------------===// +// Composite attribute kinds + +class DictionaryAttrBase : Attr()">, + "dictionary of named attribute values"> { + let storageType = [{ DictionaryAttr }]; + let returnType = [{ DictionaryAttr }]; + let convertFromStorage = "$_self"; +} + +def DictionaryAttr : DictionaryAttrBase; + +class ElementsAttrBase : + Attr { + let storageType = [{ ElementsAttr }]; + let returnType = [{ ElementsAttr }]; + let convertFromStorage = "$_self"; +} + +def ElementsAttr : ElementsAttrBase()">, + "constant vector/tensor attribute">; + +class IntElementsAttr : ElementsAttrBase< + CPred<"$_self.isa() &&" + "$_self.cast().getType()." + "getElementType().isInteger(" # width # ")">, + width # "-bit integer elements attribute"> { + + let storageType = [{ DenseIntElementsAttr }]; + let returnType = [{ DenseIntElementsAttr }]; + + // Note that this is only constructing scalar elements attribute. + let constBuilderCall = "DenseElementsAttr::get(" + "RankedTensorType::get({}, $_builder.getIntegerType(" # width # ")), " + "llvm::makeArrayRef($0)).cast()"; + let convertFromStorage = "$_self"; +} + +def I32ElementsAttr : IntElementsAttr<32>; +def I64ElementsAttr : IntElementsAttr<64>; + +class FloatElementsAttr : ElementsAttrBase< + CPred<"$_self.isa() &&" + "$_self.cast().getType()." + "getElementType().isF" # width # "()">, + width # "-bit float elements attribute"> { + + let storageType = [{ DenseElementsAttr }]; + let returnType = [{ DenseElementsAttr }]; + + // Note that this is only constructing scalar elements attribute. + let constBuilderCall = "DenseElementsAttr::get(" + "RankedTensorType::get({}, $_builder.getF" # width # "Type())," + "llvm::makeArrayRef($0))"; + let convertFromStorage = "$_self"; +} + +def F64ElementsAttr : FloatElementsAttr<64>; + +// A `width`-bit floating point elements attribute. The attribute should be +// ranked and has a shape as specified in `dims`. 
+class RankedFloatElementsAttr dims> : ElementsAttrBase< + CPred<"$_self.isa() &&" + "$_self.cast().getType()." + "getElementType().isF" # width # "() && " + // Check that this is ranked and has the specified shape. + "$_self.cast().getType().hasRank() && " + "$_self.cast().getType().getShape() == " + "llvm::ArrayRef({" # StrJoinInt.result # "})">, + width # "-bit float elements attribute of shape [" # + StrJoinInt.result # "]"> { + + let storageType = [{ DenseFPElementsAttr }]; + let returnType = [{ DenseFPElementsAttr }]; + + let constBuilderCall = "DenseElementsAttr::get(" + "RankedTensorType::get({" # StrJoinInt.result # + "}, $_builder.getF" # width # "Type()), " + "llvm::makeArrayRef($0)).cast()"; + let convertFromStorage = "$_self"; +} + +class RankedF32ElementsAttr dims> : RankedFloatElementsAttr<32, dims>; +class RankedF64ElementsAttr dims> : RankedFloatElementsAttr<64, dims>; + +// Base class for array attributes. +class ArrayAttrBase : + Attr { + let storageType = [{ ArrayAttr }]; + let returnType = [{ ArrayAttr }]; + let convertFromStorage = "$_self"; +} + +def ArrayAttr : ArrayAttrBase()">, + "array attribute">; + +// Base class for array attributes whose elements are of the same kind. +// `element` specifies the element attribute kind stored in this array. 
+class TypedArrayAttrBase: ArrayAttrBase< + And<[ + // Guarantee this is an ArrayAttr first + CPred<"$_self.isa()">, + // Guarantee all elements satisfy the constraints from `element` + Concat<"llvm::all_of($_self.cast(), " + "[](Attribute attr) { return ", + SubstLeaves<"$_self", "attr", element.predicate>, + "; })">]>, + description> { + let constBuilderCall = "$_builder.getArrayAttr($0)"; +} + +def I32ArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getI32ArrayAttr($0)"; +} +def I64ArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getI64ArrayAttr($0)"; +} +def F32ArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getF32ArrayAttr($0)"; +} +def F64ArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getF64ArrayAttr($0)"; +} +def StrArrayAttr : TypedArrayAttrBase { + let constBuilderCall = "$_builder.getStrArrayAttr($0)"; +} +def TypeArrayAttr : TypedArrayAttrBase { + let constBuilderCall = ?; +} + +// Attribute information for an Attribute field within a StructAttr. +class StructFieldAttr { + // Name of this field in the StructAttr. + string name = thisName; + + // Attribute type wrapped by the struct attr. + Attr type = thisType; +} + +// Structured attribute that wraps a DictionaryAttr and provides both a +// validation method and set of accessors for a fixed set of fields. This is +// useful when representing data that would normally be in a structure. +class StructAttr attributes> : DictionaryAttrBase { + // Name for this StructAttr. + string className = name; + + // Return type should match the name of the structure. + let returnType = name; + + // Storage type should match the name of the structure. + let storageType = name; + + // The dialect this StructAttr belongs to. + Dialect structDialect = dialect; + + // List of fields that the StructAttr contains. + list fields = attributes; +} + +// Attributes containing symbol references. 
+def SymbolRefAttr : Attr()">, + "symbol reference attribute"> { + let storageType = [{ SymbolRefAttr }]; + let returnType = [{ SymbolRefAttr }]; + let constBuilderCall = "$_builder.getSymbolRefAttr($0)"; + let convertFromStorage = "$_self"; +} +def FlatSymbolRefAttr : Attr()">, + "flat symbol reference attribute"> { + let storageType = [{ FlatSymbolRefAttr }]; + let returnType = [{ StringRef }]; + let constBuilderCall = "$_builder.getSymbolRefAttr($0)"; + let convertFromStorage = "$_self.getValue()"; +} + +def SymbolRefArrayAttr : + TypedArrayAttrBase { + let constBuilderCall = ?; +} + +//===----------------------------------------------------------------------===// +// Derive attribute kinds + +// DerivedAttr are attributes whose value is computed from properties +// of the operation. They do not require additional storage and are +// materialized as needed. +class DerivedAttr : Attr, "derived attribute"> { + let returnType = ret; + code body = b; +} + +// Derived attribute that returns a mlir::Type. +class DerivedTypeAttr : DerivedAttr<"Type", body>; + +//===----------------------------------------------------------------------===// +// Constant attribute kinds + +// Represents a constant attribute of specific Attr type. A constant +// attribute can be specified only of attributes that have a constant +// builder call defined. The constant value is specified as a string. +// +// If used as a constraint, it generates a matcher on a constant attribute by +// using the constant value builder of the attribute and the value. 
+class ConstantAttr : AttrConstraint< + CPred<"$_self == " # !subst("$0", val, attribute.constBuilderCall)>, + "constant attribute " # val> { + Attr attr = attribute; + string value = val; +} + +class ConstF32Attr : ConstantAttr; +def ConstBoolAttrFalse : ConstantAttr; +def ConstBoolAttrTrue : ConstantAttr; +def ConstUnitAttr : ConstantAttr; + +//===----------------------------------------------------------------------===// +// Common attribute constraints +//===----------------------------------------------------------------------===// + +// A general mechanism to further confine the given `attr` with all the +// `constraints`. This allows to compose complex constraints out of a series +// of more primitive ones. +class Confined constraints> : Attr< + And, + !foldl(/*init*/attr.description, /*list*/constraints, + prev, cur, prev # " " # cur.description)> { + let storageType = attr.storageType; + let returnType = attr.returnType; + let convertFromStorage = attr.convertFromStorage; + let constBuilderCall = attr.constBuilderCall; + let defaultValue = attr.defaultValue; + let isOptional = attr.isOptional; + + let baseAttr = attr; +} + +// An AttrConstraint that holds if all attr constraints specified in +// 'constraints' hold. 
+class AllAttrConstraintsOf constraints> : AttrConstraint< + And, + !foldl(/*init*/!head(constraints).description, /*list*/!tail(constraints), + prev, cur, prev # " and " # cur.description)> { +} + +class IntMinValue : AttrConstraint< + CPred<"$_self.cast().getInt() >= " # n>, + "whose minimum value is " # n>; + +class IntMaxValue : AttrConstraint< + CPred<"$_self.cast().getInt() <= " # n>, + "whose maximum value is " # n>; + +class ArrayMinCount : AttrConstraint< + CPred<"$_self.cast().size() >= " # n>, + "with at least " # n # " elements">; + +class ArrayCount : AttrConstraint< + CPred<"$_self.cast().size() == " #n>, + "with exactly " # n # " elements">; + +class IntArrayNthElemEq : AttrConstraint< + And<[ + CPred<"$_self.cast().size() > " # index>, + CPred<"$_self.cast().getValue()[" # index # "]" + ".cast().getInt() == " # value> + ]>, + "whose " # index # "-th element must be " # value>; + +class IntArrayNthElemMinValue : AttrConstraint< + And<[ + CPred<"$_self.cast().size() > " # index>, + CPred<"$_self.cast().getValue()[" # index # "]" + ".cast().getInt() >= " # min> + ]>, + "whose " # index # "-th element must be at least " # min>; + +def IsNullAttr : AttrConstraint< + CPred<"!$_self">, "empty attribute (for optional attributes)">; + +// An attribute constraint on FlatSymbolRefAttr that requires that the +// reference point to an op of `opClass` within the closest parent with a symbol +// table. +// TODO(riverriddle) Add support for nested symbol references. +class ReferToOp : AttrConstraint< + CPred<"isa_and_nonnull<" # opClass # ">(" + "::mlir::SymbolTable::lookupNearestSymbolFrom(" + "&$_op, $_self.cast().getValue()))">, + "referencing to a '" # opClass # "' symbol">; + +//===----------------------------------------------------------------------===// +// Region definitions +//===----------------------------------------------------------------------===// + +class Region : + RegionConstraint; + +// Any region. 
+def AnyRegion : Region, "any region">; + +// A region with the given number of blocks. +class SizedRegion : Region< + CPred<"$_self.getBlocks().size() == " # numBlocks>, + "region with " # numBlocks # " blocks">; + +//===----------------------------------------------------------------------===// +// OpTrait definitions +//===----------------------------------------------------------------------===// + +// OpTrait represents a trait regarding an op. +class OpTrait; + +// NativeOpTrait corresponds to the MLIR C++ OpTrait mechanism. The +// purpose to wrap around C++ symbol string with this class is to make +// traits specified for ops in TableGen less alien and more integrated. +class NativeOpTrait : OpTrait { + string trait = "OpTrait::" # prop; +} + +// ParamNativeOpTrait corresponds to the template-parameterized traits in the +// C++ implementation. MLIR uses nested class templates to implement such +// traits leading to constructs of the form "TraitName::Impl". Use +// the value in `prop` as the trait name and the value in `params` as +// parameters to construct the native trait class name. +class ParamNativeOpTrait + : NativeOpTrait::Impl">; + +// GenInternalOpTrait is an op trait that does not have direct C++ mapping but +// affects op definition generator internals, like how op builders and +// operand/attribute/result getters are generated. +class GenInternalOpTrait : OpTrait { + string trait = "OpTrait::" # prop; +} + +// PredOpTrait is an op trait implemented by way of a predicate on the op. +class PredOpTrait : OpTrait { + string description = descr; + Pred predicate = pred; +} + +// Op supports operand broadcast behavior. +def Broadcastable : NativeOpTrait<"BroadcastableTwoOperandsOneResult">; +// X op Y == Y op X +def Commutative : NativeOpTrait<"IsCommutative">; +// Op behaves like a function. +def FunctionLike : NativeOpTrait<"FunctionLike">; +// Op is isolated from above. 
+def IsolatedFromAbove : NativeOpTrait<"IsIsolatedFromAbove">; +// Op results are float or vectors/tensors thereof. +def ResultsAreFloatLike : NativeOpTrait<"ResultsAreFloatLike">; +// Op has no side effect. +def NoSideEffect : NativeOpTrait<"HasNoSideEffect">; +// Op has the same operand type. +def SameTypeOperands : NativeOpTrait<"SameTypeOperands">; +// Op has same shape for all operands. +def SameOperandsShape : NativeOpTrait<"SameOperandsShape">; +// Op has same operand and result shape. +def SameOperandsAndResultShape : NativeOpTrait<"SameOperandsAndResultShape">; +// Op has the same operand and result type. +def SameOperandsAndResultType : NativeOpTrait<"SameOperandsAndResultType">; +// Op has the same element type (or type itself, if scalar) for all operands. +def SameOperandsElementType : NativeOpTrait<"SameOperandsElementType">; +// Op has the same operand and result element type (or type itself, if scalar). +def SameOperandsAndResultElementType : + NativeOpTrait<"SameOperandsAndResultElementType">; +// Op is a symbol. +def Symbol : NativeOpTrait<"Symbol">; +// Op defines a symbol table. +def SymbolTable : NativeOpTrait<"SymbolTable">; +// Op is a terminator. +def Terminator : NativeOpTrait<"IsTerminator">; + +// Op's regions have a single block with the specified terminator. +class SingleBlockImplicitTerminator + : ParamNativeOpTrait<"SingleBlockImplicitTerminator", op>; + +// Op's parent operation is the provided one. +class HasParent + : ParamNativeOpTrait<"HasParent", op>; + +// Op result type is derived from the first attribute. If the attribute is an +// subclass of `TypeAttrBase`, its value is used, otherwise, the type of the +// attribute content is used. +def FirstAttrDerivedResultType : + GenInternalOpTrait<"FirstAttrDerivedResultType">; + +// TODO(antiagainst): Turn the following into normal traits and generate +// verification for them. + +// All variadic operands of the op have the same number of values. 
+// A variadic operand contains an array of values whose array size is only +// known at runtime. This trait requires all variadic operands of an op +// to have the same array size. +def SameVariadicOperandSize : GenInternalOpTrait<"SameVariadicOperandSize">; +// All variadic results of the op have the same number of values. +// A variadic result contains an array of values whose array size is only +// known at runtime. This trait requires all variadic results of an op +// to have the same array size. +def SameVariadicResultSize : GenInternalOpTrait<"SameVariadicResultSize">; + +// Uses an attribute named `operand_segment_sizes` to specify how many actual +// operand each ODS-declared operand (variadic or not) corresponds to. +// This trait is used for ops that have multiple variadic operands but do +// not know statically their size relationship. The attribute must be a 1D +// vector that has the same number of elements as the number of ODS declared +// operands. That means even if some operands are non-variadic, the attribute +// still need to have an element for its size, which is always 1. +def AttrSizedOperandSegments : NativeOpTrait<"AttrSizedOperandSegments">; +// Similar to AttrSizedOperandSegments, but used for results. The attribute +// should be named as `result_segment_sizes`. +def AttrSizedResultSegments : NativeOpTrait<"AttrSizedResultSegments">; + +//===----------------------------------------------------------------------===// +// OpInterface definitions +//===----------------------------------------------------------------------===// + +// Marker used to identify the argument list for an op or interface method. +def ins; + +// OpInterfaceTrait corresponds to a specific 'OpInterface' class defined in +// C++. The purpose to wrap around C++ symbol string with this class is to make +// interfaces specified for ops in TableGen less alien and more integrated. 
+class OpInterfaceTrait : NativeOpTrait<""> { + let trait = name # "::Trait"; +} + +// This class represents a single, optionally static, interface method. +// Note: non-static interface methods have an implicit 'op' parameter +// corresponding to an instance of the derived operation. +class InterfaceMethod { + // A human-readable description of what this method does. + string description = desc; + + // The name of the interface method. + string name = methodName; + + // The c++ type-name of the return type. + string returnType = retTy; + + // A dag of string that correspond to the arguments of the method. + dag arguments = args; + + // An optional body to the method. + code body = methodBody; + + // An optional default implementation of the method. + code defaultBody = defaultImplementation; +} + +// This class represents a single static interface method. +class StaticInterfaceMethod + : InterfaceMethod; + +// OpInterface represents an interface regarding an op. +class OpInterface : OpInterfaceTrait { + // A human-readable description of what this interface does. + string description = ""; + + // The name given to the c++ interface class. + string cppClassName = name; + + // The list of methods defined by this interface. + list methods = []; +} + +// Whether to declare the op interface methods in the op's header. This class +// simply wraps an OpInterface but is used to indicate that the method +// declarations should be generated. +class DeclareOpInterfaceMethods : + OpInterface { + let description = interface.description; + let cppClassName = interface.cppClassName; + let methods = interface.methods; +} + +//===----------------------------------------------------------------------===// +// Op definitions +//===----------------------------------------------------------------------===// + +// Marker used to identify the result list for an op. +def outs; + +// Marker used to identify the region list for an op. +def region; + +// Class for defining a custom builder. 
+// +// TableGen generates several generic builders for each op by default (see +// comment in the `Op` class). If the default generated ones cannot cover +// some use case, custom builders can be defined using instances of this class. +// +// The signature of the builder is always +// +// ```c++ +// static void build(Builder *builder, OperationState &state, +// ...) { +// ... +// } +// ``` +// +// To define a custom builder, the parameter list (*including* the `Builder +// *builder, OperationState &state` part) and body should be passed in +// as separate template arguments to this class. This is because we generate +// op declaration and definition into separate files. If an empty string is +// passed in for `body`, then *only* the builder declaration will be +// generated; this provides a way to define complicated builders entirely +// in C++. +class OpBuilder { + string params = p; + code body = b; +} + +// Base class for all ops. +class Op props = []> { + // The dialect of the op. + Dialect opDialect = dialect; + + // The mnemonic of the op. + string opName = mnemonic; + + // One-line human-readable description of what the op does. + string summary = ""; + + // Additional, longer human-readable description of what the op does. + string description = ""; + + // Dag containing the arguments of the op. Default to 0 arguments. + dag arguments = (ins); + + // The list of results of the op. Default to 0 results. + dag results = (outs); + + // The list of regions of the op. Default to 0 regions. + dag regions = (region); + + // Attribute getters can be added to the op by adding an Attr member + // with the name and type of the attribute. E.g., adding int attribute + // with name "value" and type "i32": + // I32Attr value; + + // Define the hooks used for building, parsing, printing, verification. + + // Custom builder. 
+ // In addition to the custom builder provided here, and unless + // skipDefaultBuilders is set, two default builders are generated, with the + // following signatures: + // + // ```c++ + // static void build(Builder *, OperationState &tblgen_state, + // Type , Type , ..., + // Value , Value , ..., + // Attribute , Attribute , ...); + // ``` + // * where the attributes follow the same declaration order as in the op. + // + // ```c++ + // static void build(Builder *, OperationState &tblgen_state, + // ArrayRef resultTypes, + // ArrayRef operands, + // ArrayRef attributes); + // ``` + list builders = ?; + + // Avoid generating default build functions. Custom builders must be + // provided. + bit skipDefaultBuilders = 0; + + // Custom parser. + code parser = ?; + + // Custom printer. + code printer = ?; + + // Custom verifier. + code verifier = ?; + + // Whether this op has associated canonicalization patterns. + // TODO(b/120163349): figure out a better way to write canonicalization + // patterns in TableGen rules directly instead of using this marker + // and C++ implementations. + bit hasCanonicalizer = 0; + + // Whether this op has a folder. + bit hasFolder = 0; + + // Op traits. + // Note: The list of traits will be uniqued by ODS. + list traits = props; + + // Additional code that will be added to the public part of the generated + // C++ code of the op declaration. + code extraClassDeclaration = ?; +} + +// The arguments of an op. +class Arguments { + dag arguments = args; +} + +// The results of an op. 
+class Results { + dag results = rets; +} + +//===----------------------------------------------------------------------===// +// Common value constraints +//===----------------------------------------------------------------------===// + +def HasNoUseOf: Constraint< + CPred<"$_self->use_begin() == $_self->use_end()">, "has no use">; + +//===----------------------------------------------------------------------===// +// Common op type constraints +//===----------------------------------------------------------------------===// + +// These traits are for verifying properties of an op that require knowledge of +// multiple arguments or results. For verifying properties of a single argument +// or result, prefer operand type constraints. + +// These traits often require including "mlir/IR/TypeUtilities.h". + +// TODO(b/135033717): Improve the autogenerated error messages. + +class Rank : + StrFunc<"$" # name # ".getType().cast().getRank()">; + +class Shape : + StrFunc<"$" # name # ".getType().cast().getShape()">; + +class ElementCount : + StrFunc<"$" # name # ".getType().cast().getNumElements()">; + +class ElementType : StrFunc<"getElementTypeOrSelf($" # name # ")">; + +class AllMatchPred values> : + CPred<"llvm::is_splat(llvm::makeArrayRef({"# StrJoin.result #"}))">; + +class AllMatch values, string description> : + PredOpTrait>; + +// TODO(b/135032064): Only works for non-variadic. 
+class AllMatchSameOperatorPred names, string operator> : + AllMatchPred; + +class AllMatchSameOperatorTrait names, string operator, + string description> : + PredOpTrait< + "all of {" # StrJoin.result # "} have same " # description, + AllMatchSameOperatorPred>; + +class AllElementCountsMatch names> : + AllMatchSameOperatorTrait.result, + "element count">; + +class AllElementTypesMatch names> : + AllMatchSameOperatorTrait.result, + "element type">; + +class AllRanksMatch names> : + AllMatchSameOperatorTrait.result, "rank">; + +class AllShapesMatch names> : + AllMatchSameOperatorTrait.result, "shape">; + +class AllTypesMatch names> : + AllMatchSameOperatorTrait; + +// Type Constraint operand `idx`'s Element type is `type`. +class TCopVTEtIs : And<[ + CPred<"$_op.getNumOperands() > " # idx>, + SubstLeaves<"$_self", "$_op.getOperand(" # idx # ")->getType()", + IsShapedTypePred>, + SubstLeaves<"$_self", "getElementTypeOrSelf($_op.getOperand(" # idx # "))", + type.predicate>]>; + +// Predicate to verify that a named argument or result's element type matches a +// given type. +class TypeIsPred : + SubstLeaves<"$_self", "$" # name # ".getType()", type.predicate>; +class TypeIs : PredOpTrait< + "'" # name # "' is " # type.description, TypeIsPred>; + +// Predicate to verify that a named argument or result's element type matches a +// given type. +class ElementTypeIsPred : And<[ + SubstLeaves<"$_self", "$" # name # ".getType()", IsShapedTypePred>, + SubstLeaves<"$_self", "getElementTypeOrSelf($" # name # ")", + type.predicate>]>; +class ElementTypeIs : PredOpTrait< + "'" # name # "' is " # type.description, ElementTypeIsPred>; + +// Predicate to verify that the i'th operand and the j'th operand have the same +// elemental type. +// Type Constraint operand `i`'s Element type is Same As operand `j`'s Element +// type. 
+class TCopVTEtIsSameAs : And<[ + CPred<"$_op.getNumOperands() > std::max(" # i # "u," # j # "u)">, + SubstLeaves<"$_self", "$_op.getOperand(" # i # ")->getType()", + IsShapedTypePred>, + SubstLeaves<"$_self", "$_op.getOperand(" # j # ")->getType()", + IsShapedTypePred>, + CPred<"mlir::getElementTypeOrSelf($_op.getOperand(" # i # ")) == " + "mlir::getElementTypeOrSelf($_op.getOperand(" # j # "))">]>; + +// Predicate to verify that the i'th result and the j'th operand exist and has +// shaped types. +class TCOpResIsShapedTypePred : And<[ + CPred<"$_op.getNumResults() > " # i>, + CPred<"$_op.getNumOperands() > " # j>, + SubstLeaves<"$_self", "$_op.getResult(" # i # ")->getType()", + IsShapedTypePred>, + SubstLeaves<"$_self", "$_op.getOperand(" # j # ")->getType()", + IsShapedTypePred>]>; + +// Predicate to verify that the i'th result and the j'th operand have the same +// type. +class TCresIsSameAsOpBase : + CPred<"$_op.getResult(" # i # ")->getType() == " + "$_op.getOperand(" # j # ")->getType()">; + +// Basic Predicate to verify that the i'th result and the j'th operand have the +// same elemental type. +class TCresVTEtIsSameAsOpBase : + CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")) == " + "getElementTypeOrSelf($_op.getOperand(" # j # "))">; + +// Predicate to verify that the i'th result and the j'th operand have the same +// elemental type. +// Type Constraint result`i`'s Element type is Same As Operand `j`'s Element +// type. +class TCresVTEtIsSameAsOp : And<[ + TCOpResIsShapedTypePred, + TCresVTEtIsSameAsOpBase]>; + +// Predicate to verify that the opId'th operand can be broadcasted to the type +// of the resId'th result. +class TCOpIsBroadcastableToRes : And<[ + TCOpResIsShapedTypePred, + CPred<"OpTrait::util::getBroadcastedType(" + "$_op.getOperand(" # opId # ")->getType(), " + "$_op.getResult(" # resId # ")->getType())">]>; + +// Predicate to verify that all the operands at the given `indices` +// have the same element type. 
+// Type Constraint operands' Element type are all Same At the given `indices`. +// We query the operands' types into a list and check they are all the same. +// Precondition: +// 1) all operands involved are of shaped type and +// 2) the indices are not out of range. +class TCopVTEtAreSameAt indices> : CPred< + "llvm::is_splat(mlir::functional::map(" + "[this](unsigned i) { return getElementTypeOrSelf(this->getOperand(i)); }, " + "llvm::ArrayRef({" # StrJoinInt.result # "})))">; + +//===----------------------------------------------------------------------===// +// Pattern definitions +//===----------------------------------------------------------------------===// + +// Marker used to identify the delta value added to the default benefit value. +def addBenefit; + +// Base class for op+ -> op+ rewrite rules. These allow declaratively +// specifying rewrite rules. +// +// A rewrite rule contains two components: a source pattern and one or more +// result patterns. Each pattern is specified as a (recursive) DAG node (tree) +// in the form of `(node arg0, arg1, ...)`. +// +// The `node` are normally MLIR ops, but it can also be one of the directives +// listed later in this section. +// +// ## Symbol binding +// +// In the source pattern, `argN` can be used to specify matchers (e.g., using +// type/attribute type constraints, etc.) and bound to a name for later use. +// We can also bound names to op instances to reference them later in +// multi-entity constraints. +// +// In the result pattern, `argN` can be used to refer to a previously bound +// name, with potential transformations (e.g., using tAttr, etc.). `argN` can +// itself be nested DAG node. We can also bound names to ops to reference +// them later in other result patterns. 
+// +// For example, +// +// ``` +// def : Pattern<(OneResultOp1:$op1 $arg0, $arg1), +// [(OneResultOp2:$op2 $arg0, $arg1), +// (OneResultOp3 $op2 (OneResultOp4))], +// [(HasStaticShapePred $op1)]>; +// ``` +// +// `$argN` is bound to the `OneResultOp1`'s N-th argument and used later to +// build `OneResultOp2`. `$op1` is bound to `OneResultOp1` and used to +// check whether the result's shape is static. `$op2` is bound to +// `OneResultOp2` and used to build `OneResultOp3`. +// +// ## Multi-result op +// +// To create multi-result ops in result pattern, you can use a syntax similar +// to uni-result op, and it will act as a value pack for all results: +// +// ``` +// def : Pattern<(ThreeResultOp ...), +// [(TwoResultOp ...), (OneResultOp ...)]>; +// ``` +// +// Then `TwoResultOp` will replace the first two values of `ThreeResultOp`. +// +// You can also use `$__N` to explicitly access the N-th result. +// ``` +// def : Pattern<(FiveResultOp ...), +// [(TwoResultOp1:$res1__1 ...), (replaceWithValue $res1__0), +// (TwoResultOp2:$res2 ...), (replaceWithValue $res2__1)]>; +// ``` +// +// Then the values generated by `FiveResultOp` will be replaced by +// +// * `FiveResultOp`#0: `TwoResultOp1`#1 +// * `FiveResultOp`#1: `TwoResultOp1`#0 +// * `FiveResultOp`#2: `TwoResultOp2`#0 +// * `FiveResultOp`#3: `TwoResultOp2`#1 +// * `FiveResultOp`#4: `TwoResultOp2`#1 +class Pattern results, list preds = [], + dag benefitAdded = (addBenefit 0)> { + dag sourcePattern = source; + // Result patterns. Each result pattern is expected to replace one result + // of the root op in the source pattern. In the case of more result patterns + // than needed to replace the source op, only the last N results generated + // by the last N result pattern is used to replace a N-result source op. + // So that the beginning result patterns can be used to generate additional + // ops to aid building the results used for replacement. + list resultPatterns = results; + // Multi-entity constraints. 
Each constraint here involves multiple entities + // matched in source pattern and places further constraints on them as a + // whole. + list constraints = preds; + // The delta value added to the default benefit value. The default value is + // the number of ops in the source pattern. The rule with the highest final + // benefit value will be applied first if there are multiple rules matches. + // This delta value can be either positive or negative. + dag benefitDelta = benefitAdded; +} + +// Form of a pattern which produces a single result. +class Pat preds = [], + dag benefitAdded = (addBenefit 0)> : + Pattern; + +// Native code call wrapper. This allows invoking an arbitrary C++ expression +// to create an op operand/attribute or replace an op result. +// +// ## Placeholders +// +// If used as a DAG leaf, i.e., `(... NativeCodeCall<"...">:$arg, ...)`, +// the wrapped expression can take special placeholders listed below: +// +// * `$_builder` will be replaced by the current `mlir::PatternRewriter`. +// * `$_self` will be replaced with the entity this transformer is attached to. +// E.g., with the definition `def transform : NativeCodeCall<"$_self...">`, +// `$_self` in `transform:$attr` will be replaced by the value for `$attr`. +// +// If used as a DAG node, i.e., `(NativeCodeCall<"..."> , ..., )`, +// then positional placeholders are also supported; placeholder `$N` in the +// wrapped C++ expression will be replaced by ``. + +class NativeCodeCall { + string expression = expr; +} + +//===----------------------------------------------------------------------===// +// Common directives +//===----------------------------------------------------------------------===// + +// Directive used in result pattern to indicate that no new op are generated, +// so to replace the matched DAG with an existing SSA value. 
+def replaceWithValue;
+
+#endif // OP_BASE
diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
new file mode 100644
index 0000000000000000000000000000000000000000..1abf82f37ee4623da5bf9ca4363a7f73dca601ba
--- /dev/null
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -0,0 +1,1225 @@
+//===- OpDefinition.h - Classes for defining concrete Op types --*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper classes for implementing the "Op" types. This
+// includes the Op type, which is the base class for Op class definitions,
+// as well as number of traits in the OpTrait namespace that provide a
+// declarative way to specify properties of Ops.
+//
+// The purpose of these types are to allow light-weight implementation of
+// concrete ops (like DimOp) with very little boilerplate.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_IR_OPDEFINITION_H
+#define MLIR_IR_OPDEFINITION_H
+
+#include "mlir/IR/Operation.h"
+#include <type_traits>
+
+namespace mlir {
+class Builder;
+
+namespace OpTrait {
+template <typename ConcreteType> class OneResult;
+}
+
+/// This class represents success/failure for operation parsing. It is
+/// essentially a simple wrapper class around LogicalResult that allows for
+/// explicit conversion to bool. This allows for the parser to chain together
+/// parse rules without the clutter of "failed/succeeded".
+class ParseResult : public LogicalResult {
+public:
+  ParseResult(LogicalResult result = success()) : LogicalResult(result) {}
+
+  // Allow diagnostics emitted during parsing to be converted to failure.
+ ParseResult(const InFlightDiagnostic &) : LogicalResult(failure()) {} + ParseResult(const Diagnostic &) : LogicalResult(failure()) {} + + /// Failure is true in a boolean context. + explicit operator bool() const { return failed(*this); } +}; +/// This class implements `Optional` functionality for ParseResult. We don't +/// directly use Optional here, because it provides an implicit conversion +/// to 'bool' which we want to avoid. This class is used to implement tri-state +/// 'parseOptional' functions that may have a failure mode when parsing that +/// shouldn't be attributed to "not present". +class OptionalParseResult { +public: + OptionalParseResult() = default; + OptionalParseResult(LogicalResult result) : impl(result) {} + OptionalParseResult(ParseResult result) : impl(result) {} + OptionalParseResult(const InFlightDiagnostic &) + : OptionalParseResult(failure()) {} + OptionalParseResult(llvm::NoneType) : impl(llvm::None) {} + + /// Returns true if we contain a valid ParseResult value. + bool hasValue() const { return impl.hasValue(); } + + /// Access the internal ParseResult value. + ParseResult getValue() const { return impl.getValue(); } + ParseResult operator*() const { return getValue(); } + +private: + Optional impl; +}; + +// These functions are out-of-line utilities, which avoids them being template +// instantiated/duplicated. +namespace impl { +/// Insert an operation, generated by `buildTerminatorOp`, at the end of the +/// region's only block if it does not have a terminator already. If the region +/// is empty, insert a new block first. `buildTerminatorOp` should return the +/// terminator operation to insert. +void ensureRegionTerminator(Region ®ion, Location loc, + function_ref buildTerminatorOp); +/// Templated version that fills the generates the provided operation type. 
+template +void ensureRegionTerminator(Region ®ion, Builder &builder, Location loc) { + ensureRegionTerminator(region, loc, [&] { + OperationState state(loc, OpTy::getOperationName()); + OpTy::build(&builder, state); + return Operation::create(state); + }); +} +} // namespace impl + +/// This is the concrete base class that holds the operation pointer and has +/// non-generic methods that only depend on State (to avoid having them +/// instantiated on template types that don't affect them. +/// +/// This also has the fallback implementations of customization hooks for when +/// they aren't customized. +class OpState { +public: + /// Ops are pointer-like, so we allow implicit conversion to bool. + operator bool() { return getOperation() != nullptr; } + + /// This implicitly converts to Operation*. + operator Operation *() const { return state; } + + /// Return the operation that this refers to. + Operation *getOperation() { return state; } + + /// Returns the closest surrounding operation that contains this operation + /// or nullptr if this is a top-level operation. + Operation *getParentOp() { return getOperation()->getParentOp(); } + + /// Return the closest surrounding parent operation that is of type 'OpTy'. + template OpTy getParentOfType() { + return getOperation()->getParentOfType(); + } + + /// Return the context this operation belongs to. + MLIRContext *getContext() { return getOperation()->getContext(); } + + /// Print the operation to the given stream. + void print(raw_ostream &os, OpPrintingFlags flags = llvm::None) { + state->print(os, flags); + } + + /// Dump this operation. + void dump() { state->dump(); } + + /// The source location the operation was defined or derived from. + Location getLoc() { return state->getLoc(); } + void setLoc(Location loc) { state->setLoc(loc); } + + /// Return all of the attributes on this operation. + ArrayRef getAttrs() { return state->getAttrs(); } + + /// A utility iterator that filters out non-dialect attributes. 
+ using dialect_attr_iterator = Operation::dialect_attr_iterator; + using dialect_attr_range = Operation::dialect_attr_range; + + /// Return a range corresponding to the dialect attributes for this operation. + dialect_attr_range getDialectAttrs() { return state->getDialectAttrs(); } + dialect_attr_iterator dialect_attr_begin() { + return state->dialect_attr_begin(); + } + dialect_attr_iterator dialect_attr_end() { return state->dialect_attr_end(); } + + /// Return an attribute with the specified name. + Attribute getAttr(StringRef name) { return state->getAttr(name); } + + /// If the operation has an attribute of the specified type, return it. + template AttrClass getAttrOfType(StringRef name) { + return getAttr(name).dyn_cast_or_null(); + } + + /// If the an attribute exists with the specified name, change it to the new + /// value. Otherwise, add a new attribute with the specified name/value. + void setAttr(Identifier name, Attribute value) { + state->setAttr(name, value); + } + void setAttr(StringRef name, Attribute value) { + setAttr(Identifier::get(name, getContext()), value); + } + + /// Set the attributes held by this operation. + void setAttrs(ArrayRef attributes) { + state->setAttrs(attributes); + } + void setAttrs(NamedAttributeList newAttrs) { state->setAttrs(newAttrs); } + + /// Set the dialect attributes for this operation, and preserve all dependent. + template void setDialectAttrs(DialectAttrs &&attrs) { + state->setDialectAttrs(std::move(attrs)); + } + + /// Remove the attribute with the specified name if it exists. The return + /// value indicates whether the attribute was present or not. + NamedAttributeList::RemoveResult removeAttr(Identifier name) { + return state->removeAttr(name); + } + NamedAttributeList::RemoveResult removeAttr(StringRef name) { + return state->removeAttr(Identifier::get(name, getContext())); + } + + /// Return true if there are no users of any results of this operation. 
+ bool use_empty() { return state->use_empty(); } + + /// Remove this operation from its parent block and delete it. + void erase() { state->erase(); } + + /// Emit an error with the op name prefixed, like "'dim' op " which is + /// convenient for verifiers. + InFlightDiagnostic emitOpError(const Twine &message = {}); + + /// Emit an error about fatal conditions with this operation, reporting up to + /// any diagnostic handlers that may be listening. + InFlightDiagnostic emitError(const Twine &message = {}); + + /// Emit a warning about this operation, reporting up to any diagnostic + /// handlers that may be listening. + InFlightDiagnostic emitWarning(const Twine &message = {}); + + /// Emit a remark about this operation, reporting up to any diagnostic + /// handlers that may be listening. + InFlightDiagnostic emitRemark(const Twine &message = {}); + + /// Walk the operation in postorder, calling the callback for each nested + /// operation(including this one). + /// See Operation::walk for more details. + template > + RetT walk(FnT &&callback) { + return state->walk(std::forward(callback)); + } + + // These are default implementations of customization hooks. +public: + /// This hook returns any canonicalization pattern rewrites that the operation + /// supports, for use by the canonicalization pass. + static void getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) {} + +protected: + /// If the concrete type didn't implement a custom verifier hook, just fall + /// back to this one which accepts everything. + LogicalResult verify() { return success(); } + + /// Unless overridden, the custom assembly form of an op is always rejected. + /// Op implementations should implement this to return failure. + /// On success, they should fill in result with the fields to use. + static ParseResult parse(OpAsmParser &parser, OperationState &result); + + // The fallback for the printer is to print it the generic assembly form. 
+ void print(OpAsmPrinter &p); + + /// Mutability management is handled by the OpWrapper/OpConstWrapper classes, + /// so we can cast it away here. + explicit OpState(Operation *state) : state(state) {} + +private: + Operation *state; +}; + +// Allow comparing operators. +inline bool operator==(OpState lhs, OpState rhs) { + return lhs.getOperation() == rhs.getOperation(); +} +inline bool operator!=(OpState lhs, OpState rhs) { + return lhs.getOperation() != rhs.getOperation(); +} + +/// This class represents a single result from folding an operation. +class OpFoldResult : public PointerUnion { + using PointerUnion::PointerUnion; +}; + +/// This template defines the foldHook as used by AbstractOperation. +/// +/// The default implementation uses a general fold method that can be defined on +/// custom ops which can return multiple results. +template +class FoldingHook { +public: + /// This is an implementation detail of the constant folder hook for + /// AbstractOperation. + static LogicalResult foldHook(Operation *op, ArrayRef operands, + SmallVectorImpl &results) { + return cast(op).fold(operands, results); + } + + /// This hook implements a generalized folder for this operation. Operations + /// can implement this to provide simplifications rules that are applied by + /// the Builder::createOrFold API and the canonicalization pass. + /// + /// This is an intentionally limited interface - implementations of this hook + /// can only perform the following changes to the operation: + /// + /// 1. They can leave the operation alone and without changing the IR, and + /// return failure. + /// 2. They can mutate the operation in place, without changing anything else + /// in the IR. In this case, return success. + /// 3. They can return a list of existing values that can be used instead of + /// the operation. In this case, fill in the results list and return + /// success. The caller will remove the operation and use those results + /// instead. 
+ /// + /// This allows expression of some simple in-place canonicalizations (e.g. + /// "x+0 -> x", "min(x,y,x,z) -> min(x,y,z)", "x+y-x -> y", etc), as well as + /// generalized constant folding. + /// + /// If not overridden, this fallback implementation always fails to fold. + /// + LogicalResult fold(ArrayRef operands, + SmallVectorImpl &results) { + return failure(); + } +}; + +/// This template specialization defines the foldHook as used by +/// AbstractOperation for single-result operations. This gives the hook a nicer +/// signature that is easier to implement. +template +class FoldingHook::type> { +public: + /// If the operation returns a single value, then the Op can be implicitly + /// converted to an Value. This yields the value of the only result. + operator Value() { + return static_cast(this)->getOperation()->getResult(0); + } + + /// This is an implementation detail of the constant folder hook for + /// AbstractOperation. + static LogicalResult foldHook(Operation *op, ArrayRef operands, + SmallVectorImpl &results) { + auto result = cast(op).fold(operands); + if (!result) + return failure(); + + // Check if the operation was folded in place. In this case, the operation + // returns itself. + if (result.template dyn_cast() != op->getResult(0)) + results.push_back(result); + return success(); + } + + /// This hook implements a generalized folder for this operation. Operations + /// can implement this to provide simplifications rules that are applied by + /// the Builder::createOrFold API and the canonicalization pass. + /// + /// This is an intentionally limited interface - implementations of this hook + /// can only perform the following changes to the operation: + /// + /// 1. They can leave the operation alone and without changing the IR, and + /// return nullptr. + /// 2. They can mutate the operation in place, without changing anything else + /// in the IR. In this case, return the operation itself. + /// 3. 
They can return an existing SSA value that can be used instead of + /// the operation. In this case, return that value. The caller will + /// remove the operation and use that result instead. + /// + /// This allows expression of some simple in-place canonicalizations (e.g. + /// "x+0 -> x", "min(x,y,x,z) -> min(x,y,z)", "x+y-x -> y", etc), as well as + /// generalized constant folding. + /// + /// If not overridden, this fallback implementation always fails to fold. + /// + OpFoldResult fold(ArrayRef operands) { return {}; } +}; + +//===----------------------------------------------------------------------===// +// Operation Trait Types +//===----------------------------------------------------------------------===// + +namespace OpTrait { + +// These functions are out-of-line implementations of the methods in the +// corresponding trait classes. This avoids them being template +// instantiated/duplicated. +namespace impl { +LogicalResult verifyZeroOperands(Operation *op); +LogicalResult verifyOneOperand(Operation *op); +LogicalResult verifyNOperands(Operation *op, unsigned numOperands); +LogicalResult verifyAtLeastNOperands(Operation *op, unsigned numOperands); +LogicalResult verifyOperandsAreFloatLike(Operation *op); +LogicalResult verifyOperandsAreIntegerLike(Operation *op); +LogicalResult verifySameTypeOperands(Operation *op); +LogicalResult verifyZeroResult(Operation *op); +LogicalResult verifyOneResult(Operation *op); +LogicalResult verifyNResults(Operation *op, unsigned numOperands); +LogicalResult verifyAtLeastNResults(Operation *op, unsigned numOperands); +LogicalResult verifySameOperandsShape(Operation *op); +LogicalResult verifySameOperandsAndResultShape(Operation *op); +LogicalResult verifySameOperandsElementType(Operation *op); +LogicalResult verifySameOperandsAndResultElementType(Operation *op); +LogicalResult verifySameOperandsAndResultType(Operation *op); +LogicalResult verifyResultsAreBoolLike(Operation *op); +LogicalResult 
verifyResultsAreFloatLike(Operation *op); +LogicalResult verifyResultsAreIntegerLike(Operation *op); +LogicalResult verifyIsTerminator(Operation *op); +LogicalResult verifyOperandSizeAttr(Operation *op, StringRef sizeAttrName); +LogicalResult verifyResultSizeAttr(Operation *op, StringRef sizeAttrName); +} // namespace impl + +/// Helper class for implementing traits. Clients are not expected to interact +/// with this directly, so its members are all protected. +template class TraitType> +class TraitBase { +protected: + /// Return the ultimate Operation being worked on. + Operation *getOperation() { + // We have to cast up to the trait type, then to the concrete type, then to + // the BaseState class in explicit hops because the concrete type will + // multiply derive from the (content free) TraitBase class, and we need to + // be able to disambiguate the path for the C++ compiler. + auto *trait = static_cast *>(this); + auto *concrete = static_cast(trait); + auto *base = static_cast(concrete); + return base->getOperation(); + } + + /// Provide default implementations of trait hooks. This allows traits to + /// provide exactly the overrides they care about. + static LogicalResult verifyTrait(Operation *op) { return success(); } + static AbstractOperation::OperationProperties getTraitProperties() { + return 0; + } +}; + +namespace detail { +/// Utility trait base that provides accessors for derived traits that have +/// multiple operands. +template class TraitType> +struct MultiOperandTraitBase : public TraitBase { + using operand_iterator = Operation::operand_iterator; + using operand_range = Operation::operand_range; + using operand_type_iterator = Operation::operand_type_iterator; + using operand_type_range = Operation::operand_type_range; + + /// Return the number of operands. + unsigned getNumOperands() { return this->getOperation()->getNumOperands(); } + + /// Return the operand at index 'i'. 
+ Value getOperand(unsigned i) { return this->getOperation()->getOperand(i); } + + /// Set the operand at index 'i' to 'value'. + void setOperand(unsigned i, Value value) { + this->getOperation()->setOperand(i, value); + } + + /// Operand iterator access. + operand_iterator operand_begin() { + return this->getOperation()->operand_begin(); + } + operand_iterator operand_end() { return this->getOperation()->operand_end(); } + operand_range getOperands() { return this->getOperation()->getOperands(); } + + /// Operand type access. + operand_type_iterator operand_type_begin() { + return this->getOperation()->operand_type_begin(); + } + operand_type_iterator operand_type_end() { + return this->getOperation()->operand_type_end(); + } + operand_type_range getOperandTypes() { + return this->getOperation()->getOperandTypes(); + } +}; +} // end namespace detail + +/// This class provides the API for ops that are known to have no +/// SSA operand. +template +class ZeroOperands : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyZeroOperands(op); + } + +private: + // Disable these. + void getOperand() {} + void setOperand() {} +}; + +/// This class provides the API for ops that are known to have exactly one +/// SSA operand. +template +class OneOperand : public TraitBase { +public: + Value getOperand() { return this->getOperation()->getOperand(0); } + + void setOperand(Value value) { this->getOperation()->setOperand(0, value); } + + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyOneOperand(op); + } +}; + +/// This class provides the API for ops that are known to have a specified +/// number of operands. 
This is used as a trait like this: +/// +/// class FooOp : public Op::Impl> { +/// +template class NOperands { +public: + static_assert(N > 1, "use ZeroOperands/OneOperand for N < 2"); + + template + class Impl + : public detail::MultiOperandTraitBase::Impl> { + public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyNOperands(op, N); + } + }; +}; + +/// This class provides the API for ops that are known to have a at least a +/// specified number of operands. This is used as a trait like this: +/// +/// class FooOp : public Op::Impl> { +/// +template class AtLeastNOperands { +public: + template + class Impl : public detail::MultiOperandTraitBase::Impl> { + public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyAtLeastNOperands(op, N); + } + }; +}; + +/// This class provides the API for ops which have an unknown number of +/// SSA operands. +template +class VariadicOperands + : public detail::MultiOperandTraitBase {}; + +/// This class provides return value APIs for ops that are known to have +/// zero results. +template +class ZeroResult : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyZeroResult(op); + } +}; + +namespace detail { +/// Utility trait base that provides accessors for derived traits that have +/// multiple results. +template class TraitType> +struct MultiResultTraitBase : public TraitBase { + using result_iterator = Operation::result_iterator; + using result_range = Operation::result_range; + using result_type_iterator = Operation::result_type_iterator; + using result_type_range = Operation::result_type_range; + + /// Return the number of results. + unsigned getNumResults() { return this->getOperation()->getNumResults(); } + + /// Return the result at index 'i'. + Value getResult(unsigned i) { return this->getOperation()->getResult(i); } + + /// Replace all uses of results of this operation with the provided 'values'. 
+ /// 'values' may correspond to an existing operation, or a range of 'Value'. + template void replaceAllUsesWith(ValuesT &&values) { + this->getOperation()->replaceAllUsesWith(std::forward(values)); + } + + /// Return the type of the `i`-th result. + Type getType(unsigned i) { return getResult(i)->getType(); } + + /// Result iterator access. + result_iterator result_begin() { + return this->getOperation()->result_begin(); + } + result_iterator result_end() { return this->getOperation()->result_end(); } + result_range getResults() { return this->getOperation()->getResults(); } + + /// Result type access. + result_type_iterator result_type_begin() { + return this->getOperation()->result_type_begin(); + } + result_type_iterator result_type_end() { + return this->getOperation()->result_type_end(); + } + result_type_range getResultTypes() { + return this->getOperation()->getResultTypes(); + } +}; +} // end namespace detail + +/// This class provides return value APIs for ops that are known to have a +/// single result. +template +class OneResult : public TraitBase { +public: + Value getResult() { return this->getOperation()->getResult(0); } + Type getType() { return getResult()->getType(); } + + /// Replace all uses of 'this' value with the new value, updating anything in + /// the IR that uses 'this' to use the other value instead. When this returns + /// there are zero uses of 'this'. + void replaceAllUsesWith(Value newValue) { + getResult()->replaceAllUsesWith(newValue); + } + + /// Replace all uses of 'this' value with the result of 'op'. + void replaceAllUsesWith(Operation *op) { + this->getOperation()->replaceAllUsesWith(op); + } + + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyOneResult(op); + } +}; + +/// This class provides the API for ops that are known to have a specified +/// number of results. 
This is used as a trait like this: +/// +/// class FooOp : public Op::Impl> { +/// +template class NResults { +public: + static_assert(N > 1, "use ZeroResult/OneResult for N < 2"); + + template + class Impl + : public detail::MultiResultTraitBase::Impl> { + public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyNResults(op, N); + } + }; +}; + +/// This class provides the API for ops that are known to have at least a +/// specified number of results. This is used as a trait like this: +/// +/// class FooOp : public Op::Impl> { +/// +template class AtLeastNResults { +public: + template + class Impl : public detail::MultiResultTraitBase::Impl> { + public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyAtLeastNResults(op, N); + } + }; +}; + +/// This class provides the API for ops which have an unknown number of +/// results. +template +class VariadicResults + : public detail::MultiResultTraitBase {}; + +/// This class provides verification for ops that are known to have the same +/// operand shape: all operands are scalars, vectors/tensors of the same +/// shape. +template +class SameOperandsShape : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameOperandsShape(op); + } +}; + +/// This class provides verification for ops that are known to have the same +/// operand and result shape: both are scalars, vectors/tensors of the same +/// shape. +template +class SameOperandsAndResultShape + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameOperandsAndResultShape(op); + } +}; + +/// This class provides verification for ops that are known to have the same +/// operand element type (or the type itself if it is scalar). 
+/// +template +class SameOperandsElementType + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameOperandsElementType(op); + } +}; + +/// This class provides verification for ops that are known to have the same +/// operand and result element type (or the type itself if it is scalar). +/// +template +class SameOperandsAndResultElementType + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameOperandsAndResultElementType(op); + } +}; + +/// This class provides verification for ops that are known to have the same +/// operand and result type. +/// +/// Note: this trait subsumes the SameOperandsAndResultShape and +/// SameOperandsAndResultElementType traits. +template +class SameOperandsAndResultType + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameOperandsAndResultType(op); + } +}; + +/// This class verifies that any results of the specified op have a boolean +/// type, a vector thereof, or a tensor thereof. +template +class ResultsAreBoolLike : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyResultsAreBoolLike(op); + } +}; + +/// This class verifies that any results of the specified op have a floating +/// point type, a vector thereof, or a tensor thereof. +template +class ResultsAreFloatLike + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyResultsAreFloatLike(op); + } +}; + +/// This class verifies that any results of the specified op have an integer or +/// index type, a vector thereof, or a tensor thereof. +template +class ResultsAreIntegerLike + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyResultsAreIntegerLike(op); + } +}; + +/// This class adds property that the operation is commutative. 
+template +class IsCommutative : public TraitBase { +public: + static AbstractOperation::OperationProperties getTraitProperties() { + return static_cast( + OperationProperty::Commutative); + } +}; + +/// This class adds property that the operation has no side effects. +template +class HasNoSideEffect : public TraitBase { +public: + static AbstractOperation::OperationProperties getTraitProperties() { + return static_cast( + OperationProperty::NoSideEffect); + } +}; + +/// This class verifies that all operands of the specified op have a float type, +/// a vector thereof, or a tensor thereof. +template +class OperandsAreFloatLike + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyOperandsAreFloatLike(op); + } +}; + +/// This class verifies that all operands of the specified op have an integer or +/// index type, a vector thereof, or a tensor thereof. +template +class OperandsAreIntegerLike + : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyOperandsAreIntegerLike(op); + } +}; + +/// This class verifies that all operands of the specified op have the same +/// type. +template +class SameTypeOperands : public TraitBase { +public: + static LogicalResult verifyTrait(Operation *op) { + return impl::verifySameTypeOperands(op); + } +}; + +/// This class provides the API for ops that are known to be terminators. 
+template +class IsTerminator : public TraitBase { +public: + static AbstractOperation::OperationProperties getTraitProperties() { + return static_cast( + OperationProperty::Terminator); + } + static LogicalResult verifyTrait(Operation *op) { + return impl::verifyIsTerminator(op); + } + + unsigned getNumSuccessors() { + return this->getOperation()->getNumSuccessors(); + } + unsigned getNumSuccessorOperands(unsigned index) { + return this->getOperation()->getNumSuccessorOperands(index); + } + + Block *getSuccessor(unsigned index) { + return this->getOperation()->getSuccessor(index); + } + + void setSuccessor(Block *block, unsigned index) { + return this->getOperation()->setSuccessor(block, index); + } + + void addSuccessorOperand(unsigned index, Value value) { + return this->getOperation()->addSuccessorOperand(index, value); + } + void addSuccessorOperands(unsigned index, ArrayRef values) { + return this->getOperation()->addSuccessorOperand(index, values); + } +}; + +/// This class provides the API for ops that are known to be isolated from +/// above. +template +class IsIsolatedFromAbove + : public TraitBase { +public: + static AbstractOperation::OperationProperties getTraitProperties() { + return static_cast( + OperationProperty::IsolatedFromAbove); + } + static LogicalResult verifyTrait(Operation *op) { + for (auto ®ion : op->getRegions()) + if (!region.isIsolatedFromAbove(op->getLoc())) + return failure(); + return success(); + } +}; + +/// This class provides APIs and verifiers for ops with regions having a single +/// block that must terminate with `TerminatorOpType`. +template struct SingleBlockImplicitTerminator { + template + class Impl : public TraitBase { + public: + static LogicalResult verifyTrait(Operation *op) { + for (unsigned i = 0, e = op->getNumRegions(); i < e; ++i) { + Region ®ion = op->getRegion(i); + + // Empty regions are fine. + if (region.empty()) + continue; + + // Non-empty regions must contain a single basic block. 
+ if (std::next(region.begin()) != region.end()) + return op->emitOpError("expects region #") + << i << " to have 0 or 1 blocks"; + + Block &block = region.front(); + if (block.empty()) + return op->emitOpError() << "expects a non-empty block"; + Operation &terminator = block.back(); + if (isa(terminator)) + continue; + + return op->emitOpError("expects regions to end with '" + + TerminatorOpType::getOperationName() + + "', found '" + + terminator.getName().getStringRef() + "'") + .attachNote() + << "in custom textual format, the absence of terminator implies " + "'" + << TerminatorOpType::getOperationName() << '\''; + } + + return success(); + } + + /// Ensure that the given region has the terminator required by this trait. + static void ensureTerminator(Region ®ion, Builder &builder, + Location loc) { + ::mlir::impl::template ensureRegionTerminator( + region, builder, loc); + } + }; +}; + +/// This class provides a verifier for ops that are expecting a specific parent. +template struct HasParent { + template + class Impl : public TraitBase { + public: + static LogicalResult verifyTrait(Operation *op) { + if (isa(op->getParentOp())) + return success(); + return op->emitOpError() << "expects parent op '" + << ParentOpType::getOperationName() << "'"; + } + }; +}; + +/// A trait for operations that have an attribute specifying operand segments. +/// +/// Certain operations can have multiple variadic operands and their size +/// relationship is not always known statically. For such cases, we need +/// a per-op-instance specification to divide the operands into logical groups +/// or segments. This can be modeled by attributes. The attribute will be named +/// as `operand_segment_sizes`. +/// +/// This trait verifies the attribute for specifying operand segments has +/// the correct type (1D vector) and values (non-negative), etc. 
+template +class AttrSizedOperandSegments + : public TraitBase { +public: + static StringRef getOperandSegmentSizeAttr() { + return "operand_segment_sizes"; + } + + static LogicalResult verifyTrait(Operation *op) { + return ::mlir::OpTrait::impl::verifyOperandSizeAttr( + op, getOperandSegmentSizeAttr()); + } +}; + +/// Similar to AttrSizedOperandSegments but used for results. +template +class AttrSizedResultSegments + : public TraitBase { +public: + static StringRef getResultSegmentSizeAttr() { return "result_segment_sizes"; } + + static LogicalResult verifyTrait(Operation *op) { + return ::mlir::OpTrait::impl::verifyResultSizeAttr( + op, getResultSegmentSizeAttr()); + } +}; + +} // end namespace OpTrait + +//===----------------------------------------------------------------------===// +// Operation Definition classes +//===----------------------------------------------------------------------===// + +/// This provides public APIs that all operations should have. The template +/// argument 'ConcreteType' should be the concrete type by CRTP and the others +/// are base classes by the policy pattern. +template class... Traits> +class Op : public OpState, + public Traits..., + public FoldingHook, + Traits...>::value> { +public: + /// Return if this operation contains the provided trait. + template